aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--TODO.md1
-rw-r--r--repo.go52
-rw-r--r--repo_test.go4
3 files changed, 29 insertions, 28 deletions
diff --git a/TODO.md b/TODO.md
index 0f5e88d..d4873aa 100644
--- a/TODO.md
+++ b/TODO.md
@@ -22,4 +22,3 @@ priority 2
----------
- use more the `Reader` API (which is analoguous to the `IOStream` in Java)
- refactor matchStream as right now it is quite complex
-- store sketches and fingerprint in Repo
diff --git a/repo.go b/repo.go
index c077dff..7fa4625 100644
--- a/repo.go
+++ b/repo.go
@@ -51,6 +51,8 @@ type Repo struct {
sketchFCount int
differ Differ
patcher Patcher
+ fingerprints FingerprintMap
+ sketches SketchMap
}
type File struct {
@@ -68,6 +70,8 @@ func NewRepo(path string) *Repo {
sketchFCount: 4,
differ: &Bsdiff{},
patcher: &Bsdiff{},
+ fingerprints: make(FingerprintMap),
+ sketches: make(SketchMap),
}
}
@@ -92,8 +96,8 @@ func (r *Repo) Commit(source string) {
files := listFiles(source)
go r.loadChunks(versions, oldChunks)
go concatFiles(files, writer)
- fingerprints, sketches := r.hashChunks(oldChunks)
- chunks := r.matchStream(reader, fingerprints, sketches)
+ r.hashChunks(oldChunks)
+ chunks := r.matchStream(reader)
extractTempChunks(chunks)
// storeChunks(newChunkPath, newChunks)
// storeFiles(newFilesPath, files)
@@ -238,26 +242,24 @@ func (r *Repo) loadChunks(versions []string, chunks chan<- StoredChunk) {
//
// For each chunk, both a fingerprint (hash over the full content) and a sketch
// (resemblance hash based on maximal values of regions) are calculated and
-// stored in an hashmap which are then returned.
-func (r *Repo) hashChunks(chunks <-chan StoredChunk) (FingerprintMap, SketchMap) {
- fingerprints := make(FingerprintMap)
- sketches := make(SketchMap)
+// stored in an hashmap.
+func (r *Repo) hashChunks(chunks <-chan StoredChunk) {
hasher := hash.Hash64(rabinkarp64.New())
for c := range chunks {
hasher.Reset()
io.Copy(hasher, c.Reader())
h := hasher.Sum64()
- fingerprints[h] = c.Id()
+ r.fingerprints[h] = c.Id()
sketch, _ := SketchChunk(c, r.chunkSize, r.sketchWSize, r.sketchSfCount, r.sketchFCount)
for _, s := range sketch {
- prev := sketches[s]
+ prev := r.sketches[s]
if contains(prev, c.Id()) {
continue
}
- sketches[s] = append(prev, c.Id())
+ r.sketches[s] = append(prev, c.Id())
}
}
- return fingerprints, sketches
+ return
}
func contains(s []*ChunkId, id *ChunkId) bool {
@@ -269,13 +271,13 @@ func contains(s []*ChunkId, id *ChunkId) bool {
return false
}
-func (r *Repo) findSimilarChunk(chunk Chunk, sketches SketchMap) (*ChunkId, bool) {
+func (r *Repo) findSimilarChunk(chunk Chunk) (*ChunkId, bool) {
var similarChunks = make(map[ChunkId]int)
var max int
var similarChunk *ChunkId
sketch, _ := SketchChunk(chunk, r.chunkSize, r.sketchWSize, r.sketchSfCount, r.sketchFCount)
for _, s := range sketch {
- chunkIds, exists := sketches[s]
+ chunkIds, exists := r.sketches[s]
if !exists {
continue
}
@@ -292,8 +294,8 @@ func (r *Repo) findSimilarChunk(chunk Chunk, sketches SketchMap) (*ChunkId, bool
return similarChunk, similarChunk != nil
}
-func (r *Repo) tryDeltaEncodeChunk(temp *TempChunk, sketches SketchMap) (Chunk, bool) {
- id, found := r.findSimilarChunk(temp, sketches)
+func (r *Repo) tryDeltaEncodeChunk(temp *TempChunk) (Chunk, bool) {
+ id, found := r.findSimilarChunk(temp)
if found {
var buff bytes.Buffer
if err := r.differ.Diff(id.Reader(r), temp.Reader(), &buff); err != nil {
@@ -311,25 +313,25 @@ func (r *Repo) tryDeltaEncodeChunk(temp *TempChunk, sketches SketchMap) (Chunk,
return temp, false
}
-func (r *Repo) tryDeltaEncodeChunks(prev *TempChunk, curr *TempChunk, sketches SketchMap) []Chunk {
+func (r *Repo) tryDeltaEncodeChunks(prev *TempChunk, curr *TempChunk) []Chunk {
if prev == nil {
- c, _ := r.tryDeltaEncodeChunk(curr, sketches)
+ c, _ := r.tryDeltaEncodeChunk(curr)
return []Chunk{c}
} else if curr.Len() < r.chunkMinLen() {
- c, success := r.tryDeltaEncodeChunk(NewTempChunk(append(prev.Bytes(), curr.Bytes()...)), sketches)
+ c, success := r.tryDeltaEncodeChunk(NewTempChunk(append(prev.Bytes(), curr.Bytes()...)))
if success {
return []Chunk{c}
} else {
return []Chunk{prev, curr}
}
} else {
- prevD, _ := r.tryDeltaEncodeChunk(prev, sketches)
- currD, _ := r.tryDeltaEncodeChunk(curr, sketches)
+ prevD, _ := r.tryDeltaEncodeChunk(prev)
+ currD, _ := r.tryDeltaEncodeChunk(curr)
return []Chunk{prevD, currD}
}
}
-func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap, sketches SketchMap) []Chunk {
+func (r *Repo) matchStream(stream io.Reader) []Chunk {
var b byte
var chunks []Chunk
var prev *TempChunk
@@ -344,16 +346,16 @@ func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap, sketch
hasher.Write(buff[:n])
for err != io.EOF {
h := hasher.Sum64()
- chunkId, exists := fingerprints[h]
+ chunkId, exists := r.fingerprints[h]
if exists {
if len(buff) > r.chunkSize && len(buff) < r.chunkSize*2 {
size := len(buff) - r.chunkSize
log.Println("Add new partial chunk of size:", size)
temp := NewTempChunk(buff[:size])
- chunks = append(chunks, r.tryDeltaEncodeChunks(prev, temp, sketches)...)
+ chunks = append(chunks, r.tryDeltaEncodeChunks(prev, temp)...)
prev = nil
} else if prev != nil {
- c, _ := r.tryDeltaEncodeChunk(prev, sketches)
+ c, _ := r.tryDeltaEncodeChunk(prev)
chunks = append(chunks, c)
prev = nil
}
@@ -370,7 +372,7 @@ func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap, sketch
if len(buff) == r.chunkSize*2 {
log.Println("Add new chunk")
if prev != nil {
- chunk, _ := r.tryDeltaEncodeChunk(prev, sketches)
+ chunk, _ := r.tryDeltaEncodeChunk(prev)
chunks = append(chunks, chunk)
}
prev = NewTempChunk(buff[:r.chunkSize])
@@ -395,7 +397,7 @@ func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap, sketch
log.Println("Add new partial chunk of size:", len(buff))
temp = NewTempChunk(buff)
}
- chunks = append(chunks, r.tryDeltaEncodeChunks(prev, temp, sketches)...)
+ chunks = append(chunks, r.tryDeltaEncodeChunks(prev, temp)...)
}
return chunks
}
diff --git a/repo_test.go b/repo_test.go
index b09f6ed..694c413 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -188,11 +188,11 @@ func TestBsdiff(t *testing.T) {
oldChunks := make(chan StoredChunk, 16)
versions := repo.loadVersions()
go repo.loadChunks(versions, oldChunks)
- fingerprints, sketches := repo.hashChunks(oldChunks)
+ repo.hashChunks(oldChunks)
// Read new data
reader := getDataStream(dataDir, concatFiles)
- recipe := repo.matchStream(reader, fingerprints, sketches)
+ recipe := repo.matchStream(reader)
newChunks := extractDeltaChunks(repo.mergeTempChunks(recipe))
assertLen(t, 2, newChunks, "New delta chunks:")
for _, c := range newChunks {