aboutsummaryrefslogtreecommitdiff
path: root/repo.go
diff options
context:
space:
mode:
authorn-peugnet <n.peugnet@free.fr>2021-08-27 18:38:07 +0200
committern-peugnet <n.peugnet@free.fr>2021-08-27 18:38:16 +0200
commit129a86b3a6780b7aee5a7469cc5adeaf2ea6c20f (patch)
treeab8423f6885c380b2bb4d807313428003d8d5e37 /repo.go
parent78251f11c91b2504edfc02b760ef53bd352b856c (diff)
downloaddna-backup-129a86b3a6780b7aee5a7469cc5adeaf2ea6c20f.tar.gz
dna-backup-129a86b3a6780b7aee5a7469cc5adeaf2ea6c20f.zip
add findSimilarChunks method to test sketches
Still missing a real test...
Diffstat (limited to 'repo.go')
-rw-r--r--repo.go53
1 files changed, 36 insertions, 17 deletions
diff --git a/repo.go b/repo.go
index 833bfdf..0779bf0 100644
--- a/repo.go
+++ b/repo.go
@@ -39,6 +39,9 @@ import (
"github.com/chmduquesne/rollinghash/rabinkarp64"
)
+type FingerprintMap map[uint64]*ChunkId
+type SketchMap map[uint64][]*ChunkId
+
type Repo struct {
path string
}
@@ -66,8 +69,8 @@ func (r *Repo) Commit(source string) {
files := listFiles(source)
go r.loadChunks(versions, oldChunks)
go concatFiles(files, writer)
- hashes := hashChunks(oldChunks)
- chunks := r.matchStream(reader, hashes)
+ fingerprints, _ := hashChunks(oldChunks)
+ chunks := r.matchStream(reader, fingerprints)
extractNewChunks(chunks)
// storeChunks(newChunkPath, newChunks)
// storeFiles(newFilesPath, files)
@@ -163,12 +166,6 @@ func loadFileList(path string) []File {
return files
}
-func printChunks(chunks <-chan []byte) {
- for c := range chunks {
- fmt.Println(c)
- }
-}
-
func storeChunks(dest string, chunks <-chan []byte) {
i := 0
for c := range chunks {
@@ -211,16 +208,38 @@ func (r *Repo) loadChunks(versions []string, chunks chan<- Chunk) {
close(chunks)
}
-func hashChunks(chunks <-chan Chunk) map[uint64]ChunkId {
- hashes := make(map[uint64]ChunkId)
+// hashChunks calculates the hashes for a channel of chunks.
+//
+// For each chunk, both a fingerprint (hash over the full content) and a sketch
+// (resemblance hash based on maximal values of regions) are calculated and
+// stored in an hashmap which are then returned.
+func hashChunks(chunks <-chan Chunk) (FingerprintMap, SketchMap) {
+ fingerprints := make(FingerprintMap)
+ sketches := make(SketchMap)
hasher := hash.Hash64(rabinkarp64.New())
for c := range chunks {
hasher.Reset()
hasher.Write(c.Value)
h := hasher.Sum64()
- hashes[h] = *c.Id
+ fingerprints[h] = c.Id
+ sketch, _ := SketchChunk(c, 32, 3, 4)
+ for _, s := range sketch {
+ sketches[s] = append(sketches[s], c.Id)
+ }
+ }
+ return fingerprints, sketches
+}
+
+func findSimilarChunks(chunks []Chunk, sketches SketchMap) {
+ for _, c := range chunks {
+ sketch, _ := SketchChunk(c, 32, 3, 4)
+ for _, s := range sketch {
+ chunkId, exists := sketches[s]
+ if exists {
+ log.Println("Found similar chunks: ", chunkId)
+ }
+ }
}
- return hashes
}
func readChunk(stream io.Reader) ([]byte, error) {
@@ -229,7 +248,7 @@ func readChunk(stream io.Reader) ([]byte, error) {
return buff, err
}
-func (r *Repo) matchStream(stream io.Reader, hashes map[uint64]ChunkId) []Chunk {
+func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chunk {
var b byte
var chunks []Chunk
bufStream := bufio.NewReaderSize(stream, chunkSize)
@@ -240,17 +259,17 @@ func (r *Repo) matchStream(stream io.Reader, hashes map[uint64]ChunkId) []Chunk
}
hasher := rabinkarp64.New()
hasher.Write(buff)
- buff = buff[0:0]
+ buff = make([]byte, 0, chunkSize)
for err != io.EOF {
h := hasher.Sum64()
- chunkId, exists := hashes[h]
+ chunkId, exists := fingerprints[h]
if exists {
if len(buff) > 0 {
log.Printf("Add new partial chunk of size: %d\n", len(buff))
- chunks = append(chunks, Chunk{Repo: r, Value: buff})
+ chunks = append(chunks, Chunk{Repo: r, Value: buff[:chunkSize]})
}
log.Printf("Add existing chunk: %d\n", chunkId)
- chunks = append(chunks, Chunk{Repo: r, Id: &chunkId})
+ chunks = append(chunks, Chunk{Repo: r, Id: chunkId})
buff = make([]byte, 0, chunkSize)
for i := 0; i < chunkSize && err == nil; i++ {
b, err = bufStream.ReadByte()