diff options
-rw-r--r-- | chunk.go | 2 | ||||
-rw-r--r-- | repo.go | 53 | ||||
-rw-r--r-- | repo_test.go | 10 | ||||
-rw-r--r-- | sketch.go | 16 | ||||
-rw-r--r-- | sketch_test.go | 2 |
5 files changed, 59 insertions, 24 deletions
@@ -45,9 +45,11 @@ func (c *Chunk) Read(buff []byte) (int, error) { func (c *Chunk) Reader() (ChunkReader, error) { if c.Value != nil { + log.Printf("Chunk %d: Reading from in-memory value\n", c.Id) return bytes.NewReader(c.Value), nil } if c.Id != nil { + log.Printf("Chunk %d: Reading from file\n", c.Id) return c.Id.Reader(c.Repo.path), nil } return nil, &ChunkError{"Uninitialized chunk"} @@ -39,6 +39,9 @@ import ( "github.com/chmduquesne/rollinghash/rabinkarp64" ) +type FingerprintMap map[uint64]*ChunkId +type SketchMap map[uint64][]*ChunkId + type Repo struct { path string } @@ -66,8 +69,8 @@ func (r *Repo) Commit(source string) { files := listFiles(source) go r.loadChunks(versions, oldChunks) go concatFiles(files, writer) - hashes := hashChunks(oldChunks) - chunks := r.matchStream(reader, hashes) + fingerprints, _ := hashChunks(oldChunks) + chunks := r.matchStream(reader, fingerprints) extractNewChunks(chunks) // storeChunks(newChunkPath, newChunks) // storeFiles(newFilesPath, files) @@ -163,12 +166,6 @@ func loadFileList(path string) []File { return files } -func printChunks(chunks <-chan []byte) { - for c := range chunks { - fmt.Println(c) - } -} - func storeChunks(dest string, chunks <-chan []byte) { i := 0 for c := range chunks { @@ -211,16 +208,38 @@ func (r *Repo) loadChunks(versions []string, chunks chan<- Chunk) { close(chunks) } -func hashChunks(chunks <-chan Chunk) map[uint64]ChunkId { - hashes := make(map[uint64]ChunkId) +// hashChunks calculates the hashes for a channel of chunks. +// +// For each chunk, both a fingerprint (hash over the full content) and a sketch +// (resemblance hash based on maximal values of regions) are calculated and +// stored in an hashmap which are then returned. +func hashChunks(chunks <-chan Chunk) (FingerprintMap, SketchMap) { + fingerprints := make(FingerprintMap) + sketches := make(SketchMap) hasher := hash.Hash64(rabinkarp64.New()) for c := range chunks { hasher.Reset() hasher.Write(c.Value) h := hasher.Sum64() - hashes[h] = *c.Id + fingerprints[h] = c.Id + sketch, _ := SketchChunk(c, 32, 3, 4) + for _, s := range sketch { + sketches[s] = append(sketches[s], c.Id) + } + } + return fingerprints, sketches +} + +func findSimilarChunks(chunks []Chunk, sketches SketchMap) { + for _, c := range chunks { + sketch, _ := SketchChunk(c, 32, 3, 4) + for _, s := range sketch { + chunkId, exists := sketches[s] + if exists { + log.Println("Found similar chunks: ", chunkId) + } + } } - return hashes } func readChunk(stream io.Reader) ([]byte, error) { @@ -229,7 +248,7 @@ func readChunk(stream io.Reader) ([]byte, error) { return buff, err } -func (r *Repo) matchStream(stream io.Reader, hashes map[uint64]ChunkId) []Chunk { +func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chunk { var b byte var chunks []Chunk bufStream := bufio.NewReaderSize(stream, chunkSize) @@ -240,17 +259,17 @@ func (r *Repo) matchStream(stream io.Reader, hashes map[uint64]ChunkId) []Chunk } hasher := rabinkarp64.New() hasher.Write(buff) - buff = buff[0:0] + buff = make([]byte, 0, chunkSize) for err != io.EOF { h := hasher.Sum64() - chunkId, exists := hashes[h] + chunkId, exists := fingerprints[h] if exists { if len(buff) > 0 { log.Printf("Add new partial chunk of size: %d\n", len(buff)) - chunks = append(chunks, Chunk{Repo: r, Value: buff}) + chunks = append(chunks, Chunk{Repo: r, Value: buff[:chunkSize]}) } log.Printf("Add existing chunk: %d\n", chunkId) - chunks = append(chunks, Chunk{Repo: r, Id: &chunkId}) + chunks = append(chunks, Chunk{Repo: r, Id: chunkId}) buff = make([]byte, 0, chunkSize) for i := 0; i < chunkSize && err == nil; i++ { b, err = bufStream.ReadByte() diff --git a/repo_test.go b/repo_test.go index 8c07c5b..d9c938d 100644 --- a/repo_test.go +++ b/repo_test.go @@ -4,6 +4,7 @@ import ( "bytes" "io" "io/ioutil" + "log" "os" "path" "testing" @@ -179,8 +180,8 @@ func TestBsdiff(t *testing.T) { versions := repo.loadVersions() go repo.loadChunks(versions, oldChunks) go concatFiles(files, writer) - hashes := hashChunks(oldChunks) - recipe := repo.matchStream(reader, hashes) + fingerprints, sketches := hashChunks(oldChunks) + recipe := repo.matchStream(reader, fingerprints) buff := new(bytes.Buffer) r2, _ := recipe[2].Reader() r0, _ := recipe[0].Reader() @@ -191,6 +192,11 @@ func TestBsdiff(t *testing.T) { if len(buff.Bytes()) >= chunkSize { t.Errorf("Bsdiff of chunk is too large: %d", len(buff.Bytes())) } + newChunks := extractNewChunks(recipe) + log.Println("Checking new chunks:", len(newChunks[0])) + for _, c := range newChunks { + findSimilarChunks(c, sketches) + } os.Remove(addedFile) } @@ -3,18 +3,23 @@ package main import ( "encoding/binary" "io" + "log" "github.com/chmduquesne/rollinghash/rabinkarp64" ) +type Sketch []uint64 + +const fBytes = 8 + // SketchChunk produces a sketch for a chunk based on wSize: the window size, // sfCount: the number of super-features, and fCount: the number of feature // per super-feature -func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, error) { +func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) { var fSize = chunkSize / (sfCount * fCount) superfeatures := make([]uint64, 0, sfCount) features := make([]uint64, 0, fCount) - buff := make([]byte, 8*fCount) + buff := make([]byte, fBytes*fCount) r, err := chunk.Reader() if err != nil { return nil, err @@ -24,7 +29,10 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, err features = features[:0] for f := 0; f < fCount; f++ { hasher.Reset() - io.CopyN(hasher, r, int64(wSize)) + n, err := io.CopyN(hasher, r, int64(wSize)) + if err != nil { + log.Println(n, err) + } max := hasher.Sum64() for w := 0; w < fSize-wSize; w++ { b, _ := r.ReadByte() @@ -37,7 +45,7 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, err features = append(features, max) } for i, f := range features { - binary.LittleEndian.PutUint64(buff[i*8:i*8+8], f) + binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], f) } hasher.Reset() hasher.Write(buff) diff --git a/sketch_test.go b/sketch_test.go index ac4ab70..2f568e6 100644 --- a/sketch_test.go +++ b/sketch_test.go @@ -20,7 +20,7 @@ func TestSketchChunk(t *testing.T) { if err != nil { t.Error(err) } - expected := []uint64{429857165471867, 6595034117354675, 8697818304802825} + expected := Sketch{429857165471867, 6595034117354675, 8697818304802825} if !cmp.Equal(sketch, expected) { t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch) } |