aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--chunk.go2
-rw-r--r--repo.go53
-rw-r--r--repo_test.go10
-rw-r--r--sketch.go16
-rw-r--r--sketch_test.go2
5 files changed, 59 insertions, 24 deletions
diff --git a/chunk.go b/chunk.go
index 0746fde..20dfe58 100644
--- a/chunk.go
+++ b/chunk.go
@@ -45,9 +45,11 @@ func (c *Chunk) Read(buff []byte) (int, error) {
func (c *Chunk) Reader() (ChunkReader, error) {
if c.Value != nil {
+ log.Printf("Chunk %d: Reading from in-memory value\n", c.Id)
return bytes.NewReader(c.Value), nil
}
if c.Id != nil {
+ log.Printf("Chunk %d: Reading from file\n", c.Id)
return c.Id.Reader(c.Repo.path), nil
}
return nil, &ChunkError{"Uninitialized chunk"}
diff --git a/repo.go b/repo.go
index 833bfdf..0779bf0 100644
--- a/repo.go
+++ b/repo.go
@@ -39,6 +39,9 @@ import (
"github.com/chmduquesne/rollinghash/rabinkarp64"
)
+type FingerprintMap map[uint64]*ChunkId
+type SketchMap map[uint64][]*ChunkId
+
type Repo struct {
path string
}
@@ -66,8 +69,8 @@ func (r *Repo) Commit(source string) {
files := listFiles(source)
go r.loadChunks(versions, oldChunks)
go concatFiles(files, writer)
- hashes := hashChunks(oldChunks)
- chunks := r.matchStream(reader, hashes)
+ fingerprints, _ := hashChunks(oldChunks)
+ chunks := r.matchStream(reader, fingerprints)
extractNewChunks(chunks)
// storeChunks(newChunkPath, newChunks)
// storeFiles(newFilesPath, files)
@@ -163,12 +166,6 @@ func loadFileList(path string) []File {
return files
}
-func printChunks(chunks <-chan []byte) {
- for c := range chunks {
- fmt.Println(c)
- }
-}
-
func storeChunks(dest string, chunks <-chan []byte) {
i := 0
for c := range chunks {
@@ -211,16 +208,38 @@ func (r *Repo) loadChunks(versions []string, chunks chan<- Chunk) {
close(chunks)
}
-func hashChunks(chunks <-chan Chunk) map[uint64]ChunkId {
- hashes := make(map[uint64]ChunkId)
+// hashChunks calculates the hashes for a channel of chunks.
+//
+// For each chunk, both a fingerprint (hash over the full content) and a sketch
+// (resemblance hash based on maximal values of regions) are calculated and
+// stored in hash maps which are then returned.
+func hashChunks(chunks <-chan Chunk) (FingerprintMap, SketchMap) {
+ fingerprints := make(FingerprintMap)
+ sketches := make(SketchMap)
hasher := hash.Hash64(rabinkarp64.New())
for c := range chunks {
hasher.Reset()
hasher.Write(c.Value)
h := hasher.Sum64()
- hashes[h] = *c.Id
+ fingerprints[h] = c.Id
+ sketch, _ := SketchChunk(c, 32, 3, 4)
+ for _, s := range sketch {
+ sketches[s] = append(sketches[s], c.Id)
+ }
+ }
+ return fingerprints, sketches
+}
+
+func findSimilarChunks(chunks []Chunk, sketches SketchMap) {
+ for _, c := range chunks {
+ sketch, _ := SketchChunk(c, 32, 3, 4)
+ for _, s := range sketch {
+ chunkId, exists := sketches[s]
+ if exists {
+ log.Println("Found similar chunks: ", chunkId)
+ }
+ }
}
- return hashes
}
func readChunk(stream io.Reader) ([]byte, error) {
@@ -229,7 +248,7 @@ func readChunk(stream io.Reader) ([]byte, error) {
return buff, err
}
-func (r *Repo) matchStream(stream io.Reader, hashes map[uint64]ChunkId) []Chunk {
+func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chunk {
var b byte
var chunks []Chunk
bufStream := bufio.NewReaderSize(stream, chunkSize)
@@ -240,17 +259,17 @@ func (r *Repo) matchStream(stream io.Reader, hashes map[uint64]ChunkId) []Chunk
}
hasher := rabinkarp64.New()
hasher.Write(buff)
- buff = buff[0:0]
+ buff = make([]byte, 0, chunkSize)
for err != io.EOF {
h := hasher.Sum64()
- chunkId, exists := hashes[h]
+ chunkId, exists := fingerprints[h]
if exists {
if len(buff) > 0 {
log.Printf("Add new partial chunk of size: %d\n", len(buff))
- chunks = append(chunks, Chunk{Repo: r, Value: buff})
+ chunks = append(chunks, Chunk{Repo: r, Value: buff[:chunkSize]})
}
log.Printf("Add existing chunk: %d\n", chunkId)
- chunks = append(chunks, Chunk{Repo: r, Id: &chunkId})
+ chunks = append(chunks, Chunk{Repo: r, Id: chunkId})
buff = make([]byte, 0, chunkSize)
for i := 0; i < chunkSize && err == nil; i++ {
b, err = bufStream.ReadByte()
diff --git a/repo_test.go b/repo_test.go
index 8c07c5b..d9c938d 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -4,6 +4,7 @@ import (
"bytes"
"io"
"io/ioutil"
+ "log"
"os"
"path"
"testing"
@@ -179,8 +180,8 @@ func TestBsdiff(t *testing.T) {
versions := repo.loadVersions()
go repo.loadChunks(versions, oldChunks)
go concatFiles(files, writer)
- hashes := hashChunks(oldChunks)
- recipe := repo.matchStream(reader, hashes)
+ fingerprints, sketches := hashChunks(oldChunks)
+ recipe := repo.matchStream(reader, fingerprints)
buff := new(bytes.Buffer)
r2, _ := recipe[2].Reader()
r0, _ := recipe[0].Reader()
@@ -191,6 +192,11 @@ func TestBsdiff(t *testing.T) {
if len(buff.Bytes()) >= chunkSize {
t.Errorf("Bsdiff of chunk is too large: %d", len(buff.Bytes()))
}
+ newChunks := extractNewChunks(recipe)
+ log.Println("Checking new chunks:", len(newChunks[0]))
+ for _, c := range newChunks {
+ findSimilarChunks(c, sketches)
+ }
os.Remove(addedFile)
}
diff --git a/sketch.go b/sketch.go
index c5f0838..f226661 100644
--- a/sketch.go
+++ b/sketch.go
@@ -3,18 +3,23 @@ package main
import (
"encoding/binary"
"io"
+ "log"
"github.com/chmduquesne/rollinghash/rabinkarp64"
)
+type Sketch []uint64
+
+const fBytes = 8
+
// SketchChunk produces a sketch for a chunk based on wSize: the window size,
// sfCount: the number of super-features, and fCount: the number of feature
// per super-feature
-func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, error) {
+func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) {
var fSize = chunkSize / (sfCount * fCount)
superfeatures := make([]uint64, 0, sfCount)
features := make([]uint64, 0, fCount)
- buff := make([]byte, 8*fCount)
+ buff := make([]byte, fBytes*fCount)
r, err := chunk.Reader()
if err != nil {
return nil, err
@@ -24,7 +29,10 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, err
features = features[:0]
for f := 0; f < fCount; f++ {
hasher.Reset()
- io.CopyN(hasher, r, int64(wSize))
+ n, err := io.CopyN(hasher, r, int64(wSize))
+ if err != nil {
+ log.Println(n, err)
+ }
max := hasher.Sum64()
for w := 0; w < fSize-wSize; w++ {
b, _ := r.ReadByte()
@@ -37,7 +45,7 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, err
features = append(features, max)
}
for i, f := range features {
- binary.LittleEndian.PutUint64(buff[i*8:i*8+8], f)
+ binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], f)
}
hasher.Reset()
hasher.Write(buff)
diff --git a/sketch_test.go b/sketch_test.go
index ac4ab70..2f568e6 100644
--- a/sketch_test.go
+++ b/sketch_test.go
@@ -20,7 +20,7 @@ func TestSketchChunk(t *testing.T) {
if err != nil {
t.Error(err)
}
- expected := []uint64{429857165471867, 6595034117354675, 8697818304802825}
+ expected := Sketch{429857165471867, 6595034117354675, 8697818304802825}
if !cmp.Equal(sketch, expected) {
t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch)
}