aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--chunk.go2
-rw-r--r--repo.go53
-rw-r--r--repo_test.go10
-rw-r--r--sketch.go16
-rw-r--r--sketch_test.go2
5 files changed, 59 insertions, 24 deletions
diff --git a/chunk.go b/chunk.go
index 0746fde..20dfe58 100644
--- a/chunk.go
+++ b/chunk.go
@@ -45,9 +45,11 @@ func (c *Chunk) Read(buff []byte) (int, error) {
func (c *Chunk) Reader() (ChunkReader, error) {
if c.Value != nil {
+ log.Printf("Chunk %d: Reading from in-memory value\n", c.Id)
return bytes.NewReader(c.Value), nil
}
if c.Id != nil {
+ log.Printf("Chunk %d: Reading from file\n", c.Id)
return c.Id.Reader(c.Repo.path), nil
}
return nil, &ChunkError{"Uninitialized chunk"}
diff --git a/repo.go b/repo.go
index 833bfdf..0779bf0 100644
--- a/repo.go
+++ b/repo.go
@@ -39,6 +39,9 @@ import (
"github.com/chmduquesne/rollinghash/rabinkarp64"
)
+type FingerprintMap map[uint64]*ChunkId
+type SketchMap map[uint64][]*ChunkId
+
type Repo struct {
path string
}
@@ -66,8 +69,8 @@ func (r *Repo) Commit(source string) {
files := listFiles(source)
go r.loadChunks(versions, oldChunks)
go concatFiles(files, writer)
- hashes := hashChunks(oldChunks)
- chunks := r.matchStream(reader, hashes)
+ fingerprints, _ := hashChunks(oldChunks)
+ chunks := r.matchStream(reader, fingerprints)
extractNewChunks(chunks)
// storeChunks(newChunkPath, newChunks)
// storeFiles(newFilesPath, files)
@@ -163,12 +166,6 @@ func loadFileList(path string) []File {
return files
}
-func printChunks(chunks <-chan []byte) {
- for c := range chunks {
- fmt.Println(c)
- }
-}
-
func storeChunks(dest string, chunks <-chan []byte) {
i := 0
for c := range chunks {
@@ -211,16 +208,38 @@ func (r *Repo) loadChunks(versions []string, chunks chan<- Chunk) {
close(chunks)
}
-func hashChunks(chunks <-chan Chunk) map[uint64]ChunkId {
- hashes := make(map[uint64]ChunkId)
+// hashChunks calculates the hashes for a channel of chunks.
+//
+// For each chunk, both a fingerprint (hash over the full content) and a sketch
+// (resemblance hash based on maximal values of regions) are calculated and
+// stored in hash maps which are then returned.
+func hashChunks(chunks <-chan Chunk) (FingerprintMap, SketchMap) {
+ fingerprints := make(FingerprintMap)
+ sketches := make(SketchMap)
hasher := hash.Hash64(rabinkarp64.New())
for c := range chunks {
hasher.Reset()
hasher.Write(c.Value)
h := hasher.Sum64()
- hashes[h] = *c.Id
+ fingerprints[h] = c.Id
+ sketch, _ := SketchChunk(c, 32, 3, 4)
+ for _, s := range sketch {
+ sketches[s] = append(sketches[s], c.Id)
+ }
+ }
+ return fingerprints, sketches
+}
+
+func findSimilarChunks(chunks []Chunk, sketches SketchMap) {
+ for _, c := range chunks {
+ sketch, _ := SketchChunk(c, 32, 3, 4)
+ for _, s := range sketch {
+ chunkId, exists := sketches[s]
+ if exists {
+ log.Println("Found similar chunks: ", chunkId)
+ }
+ }
}
- return hashes
}
func readChunk(stream io.Reader) ([]byte, error) {
@@ -229,7 +248,7 @@ func readChunk(stream io.Reader) ([]byte, error) {
return buff, err
}
-func (r *Repo) matchStream(stream io.Reader, hashes map[uint64]ChunkId) []Chunk {
+func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chunk {
var b byte
var chunks []Chunk
bufStream := bufio.NewReaderSize(stream, chunkSize)
@@ -240,17 +259,17 @@ func (r *Repo) matchStream(stream io.Reader, hashes map[uint64]ChunkId) []Chunk
}
hasher := rabinkarp64.New()
hasher.Write(buff)
- buff = buff[0:0]
+ buff = make([]byte, 0, chunkSize)
for err != io.EOF {
h := hasher.Sum64()
- chunkId, exists := hashes[h]
+ chunkId, exists := fingerprints[h]
if exists {
if len(buff) > 0 {
log.Printf("Add new partial chunk of size: %d\n", len(buff))
- chunks = append(chunks, Chunk{Repo: r, Value: buff})
+ chunks = append(chunks, Chunk{Repo: r, Value: buff[:chunkSize]})
}
log.Printf("Add existing chunk: %d\n", chunkId)
- chunks = append(chunks, Chunk{Repo: r, Id: &chunkId})
+ chunks = append(chunks, Chunk{Repo: r, Id: chunkId})
buff = make([]byte, 0, chunkSize)
for i := 0; i < chunkSize && err == nil; i++ {
b, err = bufStream.ReadByte()
diff --git a/repo_test.go b/repo_test.go
index 8c07c5b..d9c938d 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -4,6 +4,7 @@ import (
"bytes"
"io"
"io/ioutil"
+ "log"
"os"
"path"
"testing"
@@ -179,8 +180,8 @@ func TestBsdiff(t *testing.T) {
versions := repo.loadVersions()
go repo.loadChunks(versions, oldChunks)
go concatFiles(files, writer)
- hashes := hashChunks(oldChunks)
- recipe := repo.matchStream(reader, hashes)
+ fingerprints, sketches := hashChunks(oldChunks)
+ recipe := repo.matchStream(reader, fingerprints)
buff := new(bytes.Buffer)
r2, _ := recipe[2].Reader()
r0, _ := recipe[0].Reader()
@@ -191,6 +192,11 @@ func TestBsdiff(t *testing.T) {
if len(buff.Bytes()) >= chunkSize {
t.Errorf("Bsdiff of chunk is too large: %d", len(buff.Bytes()))
}
+ newChunks := extractNewChunks(recipe)
+ log.Println("Checking new chunks:", len(newChunks[0]))
+ for _, c := range newChunks {
+ findSimilarChunks(c, sketches)
+ }
os.Remove(addedFile)
}
diff --git a/sketch.go b/sketch.go
index c5f0838..f226661 100644
--- a/sketch.go
+++ b/sketch.go
@@ -3,18 +3,23 @@ package main
import (
"encoding/binary"
"io"
+ "log"
"github.com/chmduquesne/rollinghash/rabinkarp64"
)
+type Sketch []uint64
+
+const fBytes = 8
+
// SketchChunk produces a sketch for a chunk based on wSize: the window size,
// sfCount: the number of super-features, and fCount: the number of feature
// per super-feature
-func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, error) {
+func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) {
var fSize = chunkSize / (sfCount * fCount)
superfeatures := make([]uint64, 0, sfCount)
features := make([]uint64, 0, fCount)
- buff := make([]byte, 8*fCount)
+ buff := make([]byte, fBytes*fCount)
r, err := chunk.Reader()
if err != nil {
return nil, err
@@ -24,7 +29,10 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, err
features = features[:0]
for f := 0; f < fCount; f++ {
hasher.Reset()
- io.CopyN(hasher, r, int64(wSize))
+ n, err := io.CopyN(hasher, r, int64(wSize))
+ if err != nil {
+ log.Println(n, err)
+ }
max := hasher.Sum64()
for w := 0; w < fSize-wSize; w++ {
b, _ := r.ReadByte()
@@ -37,7 +45,7 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, err
features = append(features, max)
}
for i, f := range features {
- binary.LittleEndian.PutUint64(buff[i*8:i*8+8], f)
+ binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], f)
}
hasher.Reset()
hasher.Write(buff)
diff --git a/sketch_test.go b/sketch_test.go
index ac4ab70..2f568e6 100644
--- a/sketch_test.go
+++ b/sketch_test.go
@@ -20,7 +20,7 @@ func TestSketchChunk(t *testing.T) {
if err != nil {
t.Error(err)
}
- expected := []uint64{429857165471867, 6595034117354675, 8697818304802825}
+ expected := Sketch{429857165471867, 6595034117354675, 8697818304802825}
if !cmp.Equal(sketch, expected) {
t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch)
}