author     n-peugnet <n.peugnet@free.fr>   2021-08-31 16:28:07 +0200
committer  n-peugnet <n.peugnet@free.fr>   2021-08-31 16:38:34 +0200
commit     504fe3db47c058807b656a8e63bb27c12420f268 (patch)
tree       5fec35a147b3234633d237601cc49627fbedf331
parent     c481eb2b44adf50b62de3b9e3355f64973967d52 (diff)
download   dna-backup-504fe3db47c058807b656a8e63bb27c12420f268.tar.gz
           dna-backup-504fe3db47c058807b656a8e63bb27c12420f268.zip
join too small temp chunks with previous one if possible
-rw-r--r--  README.md    |  2
-rw-r--r--  TODO.md      |  3
-rw-r--r--  chunk.go     |  8
-rw-r--r--  repo.go      | 34
-rw-r--r--  repo_test.go | 74
-rw-r--r--  sketch.go    | 10
6 files changed, 82 insertions, 49 deletions
diff --git a/README.md b/README.md
index 8d90902..b5b5097 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ _Classical go_
go build
# Test
-go test
+go test -v
# Run
./dna-backup <source-dir> <repository>
diff --git a/TODO.md b/TODO.md
index 2d7f790..23454f7 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,8 +1,7 @@
priority 1
----------
-- join non-deduplicated chunks
- - choose when and how to
- delta encode chunks
+- match stream against chunks from itself
- read from repo
priority 2
diff --git a/chunk.go b/chunk.go
index 78e689d..092b758 100644
--- a/chunk.go
+++ b/chunk.go
@@ -107,3 +107,11 @@ func (c *TempChunk) Reader() ChunkReader {
func (c *TempChunk) Len() int {
return len(c.value)
}
+
+func (c *TempChunk) AppendFrom(r io.Reader) {
+ buff, err := io.ReadAll(r)
+ if err != nil {
+ println("Chunk: error appending to temp chunk:", err)
+ }
+ c.value = append(c.value, buff...)
+}
diff --git a/repo.go b/repo.go
index 55e830f..d32d437 100644
--- a/repo.go
+++ b/repo.go
@@ -71,7 +71,7 @@ func (r *Repo) Commit(source string) {
go concatFiles(files, writer)
fingerprints, _ := hashChunks(oldChunks)
chunks := r.matchStream(reader, fingerprints)
- extractNewChunks(chunks)
+ extractTempChunks(chunks)
// storeChunks(newChunkPath, newChunks)
// storeFiles(newFilesPath, files)
fmt.Println(files)
@@ -327,24 +327,32 @@ func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chun
return chunks
}
-// extractNewChunks extracts new chunks from an array of chunks and
-// returns them in an array of consecutive new chunk's array
-func extractNewChunks(chunks []Chunk) (ret [][]Chunk) {
- var i int
- ret = append(ret, nil)
+// extractTempChunks extracts temporary chunks from an array of chunks.
+// If a chunk is smaller than the size required to calculate a super-feature,
+// it is then appended to the previous consecutive temporary chunk if it exists.
+func extractTempChunks(chunks []Chunk) (ret []Chunk) {
+ var prev *TempChunk
+ var curr *TempChunk
for _, c := range chunks {
- _, isTmp := c.(*TempChunk)
+ tmp, isTmp := c.(*TempChunk)
if !isTmp {
- if len(ret[i]) != 0 {
- i++
- ret = append(ret, nil)
+ if prev != nil && curr.Len() <= SuperFeatureSize(chunkSize, sketchSfCount, sketchFCount) {
+ prev.AppendFrom(curr.Reader())
+ } else if curr != nil {
+ ret = append(ret, curr)
}
+ curr = nil
+ prev = nil
} else {
- ret[i] = append(ret[i], c)
+ prev = curr
+ curr = tmp
+ if prev != nil {
+ ret = append(ret, prev)
+ }
}
}
- if len(ret[i]) == 0 {
- ret = ret[:i]
+ if curr != nil {
+ ret = append(ret, curr)
}
return
}
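
For context (editor's illustration, not part of the commit): a minimal, self-contained Go sketch of the merging rule that extractTempChunks now applies. The type name tempChunk, the function name extractTemp and the fixed threshold are simplifications; the real code works on the repo's Chunk interface and uses SuperFeatureSize(chunkSize, sketchSfCount, sketchFCount) as the threshold.

package main

import "fmt"

// tempChunk stands in for the repo's *TempChunk: bytes with no stored match yet.
type tempChunk struct{ value []byte }

// extractTemp walks a stream where nil marks a matched (deduplicated) chunk.
// When a temporary chunk is directly followed by another temporary chunk that
// is too small to be sketched (<= threshold), the small one is folded into
// its predecessor instead of being emitted on its own.
func extractTemp(stream []*tempChunk, threshold int) []*tempChunk {
	var ret []*tempChunk
	var prev, curr *tempChunk
	for _, c := range stream {
		if c == nil { // a matched chunk ends the current run of temp chunks
			if prev != nil && len(curr.value) <= threshold {
				prev.value = append(prev.value, curr.value...)
			} else if curr != nil {
				ret = append(ret, curr)
			}
			prev, curr = nil, nil
			continue
		}
		prev, curr = curr, c
		if prev != nil {
			ret = append(ret, prev)
		}
	}
	if curr != nil {
		ret = append(ret, curr)
	}
	return ret
}

func main() {
	stream := []*tempChunk{
		nil,
		{value: []byte("a")},
		nil,
		{value: []byte("b")},
		{value: []byte("c")}, // small enough: joined with "b"
		nil,
	}
	for _, t := range extractTemp(stream, 4) {
		fmt.Printf("%q\n", t.value)
	}
	// Prints "a" then "bc", mirroring the expectations in the updated
	// TestExtractNewChunks below.
}
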
diff --git a/repo_test.go b/repo_test.go
index cdd3024..134b55c 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -127,20 +127,10 @@ func TestExtractNewChunks(t *testing.T) {
&TempChunk{value: []byte{'c'}},
&LoadedChunk{id: &ChunkId{0, 1}},
}
- newChunks := extractNewChunks(chunks)
- if len(newChunks) != 2 {
- t.Error("New chunks should contain 2 slices")
- t.Log("Actual: ", newChunks)
- }
- if len(newChunks[1]) != 2 {
- t.Error("New chunks second slice should contain 2 chunks")
- t.Log("Actual: ", newChunks[0])
- }
- if !reflect.DeepEqual(newChunks[1][0], chunks[2]) {
- t.Error("New chunks do not match")
- t.Log("Expected: ", chunks[2])
- t.Log("Actual: ", newChunks[1][0])
- }
+ newChunks := extractTempChunks(chunks)
+ assertLen(t, 2, newChunks, "New chunks:")
+ assertChunkContent(t, []byte{'a'}, newChunks[0], "First new:")
+ assertChunkContent(t, []byte{'b', 'c'}, newChunks[1], "Second New:")
}
func TestStoreLoadFiles(t *testing.T) {
@@ -150,9 +140,7 @@ func TestStoreLoadFiles(t *testing.T) {
files1 := listFiles(dataDir)
storeFileList(resultFiles, files1)
files2 := loadFileList(resultFiles)
- if len(files1) != 4 {
- t.Errorf("Incorrect number of files: %d, should be %d\n", len(files1), 4)
- }
+ assertLen(t, 4, files1, "Files:")
for i, f := range files1 {
if f != files2[i] {
t.Errorf("Loaded file data %d does not match stored one", i)
@@ -189,22 +177,44 @@ func TestBsdiff(t *testing.T) {
go concatFiles(files, writer)
fingerprints, sketches := hashChunks(oldChunks)
recipe := repo.matchStream(reader, fingerprints)
- newChunks := extractNewChunks(recipe)
- log.Println("Checking new chunks:", len(newChunks[0]))
- for _, chunks := range newChunks {
- for _, c := range chunks {
- id, exists := findSimilarChunk(c, sketches)
- log.Println(id, exists)
- if exists {
- patch := new(bytes.Buffer)
- stored := id.Reader(repo.path)
- new := c.Reader()
- bsdiff.Reader(stored, new, patch)
- log.Println("Patch size:", patch.Len())
- if patch.Len() >= chunkSize/10 {
- t.Errorf("Bsdiff of chunk is too large: %d", patch.Len())
- }
+ newChunks := extractTempChunks(recipe)
+ assertLen(t, 2, newChunks, "New chunks:")
+ for _, c := range newChunks {
+ id, exists := findSimilarChunk(c, sketches)
+ log.Println(id, exists)
+ if exists {
+ patch := new(bytes.Buffer)
+ stored := id.Reader(repo.path)
+ new := c.Reader()
+ bsdiff.Reader(stored, new, patch)
+ log.Println("Patch size:", patch.Len())
+ if patch.Len() >= chunkSize/10 {
+ t.Errorf("Bsdiff of chunk is too large: %d", patch.Len())
}
}
}
}
+
+func assertLen(t *testing.T, expected int, actual interface{}, prefix string) {
+ s := reflect.ValueOf(actual)
+ if s.Len() != expected {
+ t.Error(prefix, "incorrect length, expected:", expected, ", actual:", s.Len())
+ }
+}
+
+func assertSameSlice(t *testing.T, expected []byte, actual []byte, prefix string) {
+ assertLen(t, len(expected), actual, prefix)
+ for i := 0; i < len(expected); i++ {
+ if expected[i] != actual[i] {
+ t.Fatal(prefix, "incorrect value", i, ", expected:", expected[i], ", actual:", actual[i])
+ }
+ }
+}
+
+func assertChunkContent(t *testing.T, expected []byte, c Chunk, prefix string) {
+ buf, err := io.ReadAll(c.Reader())
+ if err != nil {
+ t.Fatal(err)
+ }
+ assertSameSlice(t, expected, buf, prefix+" Chunk content")
+}
diff --git a/sketch.go b/sketch.go
index 9910848..c88f043 100644
--- a/sketch.go
+++ b/sketch.go
@@ -16,7 +16,7 @@ const fBytes = 8
// sfCount: the number of super-features, and fCount: the number of features
// per super-feature
func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) {
- var fSize = chunkSize / (sfCount * fCount)
+ var fSize = FeatureSize(chunkSize, sfCount, fCount)
superfeatures := make([]uint64, 0, sfCount)
features := make([]uint64, 0, fCount*sfCount)
buff := make([]byte, fBytes*fCount)
@@ -49,3 +49,11 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error
}
return superfeatures, nil
}
+
+func SuperFeatureSize(chunkSize int, sfCount int, fCount int) int {
+ return FeatureSize(chunkSize, sfCount, fCount) * sfCount
+}
+
+func FeatureSize(chunkSize int, sfCount int, fCount int) int {
+ return chunkSize / (sfCount * fCount)
+}
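
To make the new helpers concrete, a small worked example (editor's illustration; the values 8192, 4 and 4 are assumptions chosen for readability, not necessarily the repo's actual chunkSize, sketchSfCount and sketchFCount):

package main

import "fmt"

func FeatureSize(chunkSize int, sfCount int, fCount int) int {
	return chunkSize / (sfCount * fCount)
}

func SuperFeatureSize(chunkSize int, sfCount int, fCount int) int {
	return FeatureSize(chunkSize, sfCount, fCount) * sfCount
}

func main() {
	// With 8 KiB chunks and 4 super-features of 4 features each:
	fmt.Println(FeatureSize(8192, 4, 4))      // 512  bytes per feature
	fmt.Println(SuperFeatureSize(8192, 4, 4)) // 2048 bytes, i.e. chunkSize / fCount
	// Under these assumed settings, a temp chunk of at most 2048 bytes would
	// be joined with its predecessor by extractTempChunks.
}
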