author     n-peugnet <n.peugnet@free.fr>   2021-08-31 16:28:07 +0200
committer  n-peugnet <n.peugnet@free.fr>   2021-08-31 16:38:34 +0200
commit     504fe3db47c058807b656a8e63bb27c12420f268 (patch)
tree       5fec35a147b3234633d237601cc49627fbedf331
parent     c481eb2b44adf50b62de3b9e3355f64973967d52 (diff)
download   dna-backup-504fe3db47c058807b656a8e63bb27c12420f268.tar.gz
           dna-backup-504fe3db47c058807b656a8e63bb27c12420f268.zip
join too small temp chunks with previous one if possible
-rw-r--r--  README.md    |  2
-rw-r--r--  TODO.md      |  3
-rw-r--r--  chunk.go     |  8
-rw-r--r--  repo.go      | 34
-rw-r--r--  repo_test.go | 74
-rw-r--r--  sketch.go    | 10
6 files changed, 82 insertions, 49 deletions
diff --git a/README.md b/README.md
index 8d90902..b5b5097 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ _Classical go_
go build
# Test
-go test
+go test -v
# Run
./dna-backup <source-dir> <repository>
diff --git a/TODO.md b/TODO.md
index 2d7f790..23454f7 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,8 +1,7 @@
priority 1
----------
-- join non-deduplicated chunks
- - choose when and how to
- delta encode chunks
+- match stream against chunks from itself
- read from repo
priority 2
diff --git a/chunk.go b/chunk.go
index 78e689d..092b758 100644
--- a/chunk.go
+++ b/chunk.go
@@ -107,3 +107,11 @@ func (c *TempChunk) Reader() ChunkReader {
func (c *TempChunk) Len() int {
return len(c.value)
}
+
+func (c *TempChunk) AppendFrom(r io.Reader) {
+ buff, err := io.ReadAll(r)
+ if err != nil {
+ println("Chunk: error appending to temp chunk:", err)
+ }
+ c.value = append(c.value, buff...)
+}
diff --git a/repo.go b/repo.go
index 55e830f..d32d437 100644
--- a/repo.go
+++ b/repo.go
@@ -71,7 +71,7 @@ func (r *Repo) Commit(source string) {
go concatFiles(files, writer)
fingerprints, _ := hashChunks(oldChunks)
chunks := r.matchStream(reader, fingerprints)
- extractNewChunks(chunks)
+ extractTempChunks(chunks)
// storeChunks(newChunkPath, newChunks)
// storeFiles(newFilesPath, files)
fmt.Println(files)
@@ -327,24 +327,32 @@ func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chun
return chunks
}
-// extractNewChunks extracts new chunks from an array of chunks and
-// returns them in an array of consecutive new chunk's array
-func extractNewChunks(chunks []Chunk) (ret [][]Chunk) {
- var i int
- ret = append(ret, nil)
+// extractTempChunks extracts temporary chunks from an array of chunks.
+// If a chunk is smaller than the size required to calculate a super-feature,
+// it is then appended to the previous consecutive temporary chunk if it exists.
+func extractTempChunks(chunks []Chunk) (ret []Chunk) {
+ var prev *TempChunk
+ var curr *TempChunk
for _, c := range chunks {
- _, isTmp := c.(*TempChunk)
+ tmp, isTmp := c.(*TempChunk)
if !isTmp {
- if len(ret[i]) != 0 {
- i++
- ret = append(ret, nil)
+ if prev != nil && curr.Len() <= SuperFeatureSize(chunkSize, sketchSfCount, sketchFCount) {
+ prev.AppendFrom(curr.Reader())
+ } else if curr != nil {
+ ret = append(ret, curr)
}
+ curr = nil
+ prev = nil
} else {
- ret[i] = append(ret[i], c)
+ prev = curr
+ curr = tmp
+ if prev != nil {
+ ret = append(ret, prev)
+ }
}
}
- if len(ret[i]) == 0 {
- ret = ret[:i]
+ if curr != nil {
+ ret = append(ret, curr)
}
return
}
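
For context (editor's illustration, not part of the commit): a minimal, self-contained Go sketch of the merging rule that extractTempChunks now applies. The type name tempChunk, the function name extractTemp and the fixed threshold are simplifications; the real code works on the repo's Chunk interface and uses SuperFeatureSize(chunkSize, sketchSfCount, sketchFCount) as the threshold.

package main

import "fmt"

// tempChunk stands in for the repo's *TempChunk: bytes with no stored match yet.
type tempChunk struct{ value []byte }

// extractTemp walks a stream where nil marks a matched (deduplicated) chunk.
// When a temporary chunk is directly followed by another temporary chunk that
// is too small to be sketched (<= threshold), the small one is folded into
// its predecessor instead of being emitted on its own.
func extractTemp(stream []*tempChunk, threshold int) []*tempChunk {
	var ret []*tempChunk
	var prev, curr *tempChunk
	for _, c := range stream {
		if c == nil { // a matched chunk ends the current run of temp chunks
			if prev != nil && len(curr.value) <= threshold {
				prev.value = append(prev.value, curr.value...)
			} else if curr != nil {
				ret = append(ret, curr)
			}
			prev, curr = nil, nil
			continue
		}
		prev, curr = curr, c
		if prev != nil {
			ret = append(ret, prev)
		}
	}
	if curr != nil {
		ret = append(ret, curr)
	}
	return ret
}

func main() {
	stream := []*tempChunk{
		nil,
		{value: []byte("a")},
		nil,
		{value: []byte("b")},
		{value: []byte("c")}, // small enough: joined with "b"
		nil,
	}
	for _, t := range extractTemp(stream, 4) {
		fmt.Printf("%q\n", t.value)
	}
	// Prints "a" then "bc", mirroring the expectations in the updated
	// TestExtractNewChunks below.
}
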
diff --git a/repo_test.go b/repo_test.go
index cdd3024..134b55c 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -127,20 +127,10 @@ func TestExtractNewChunks(t *testing.T) {
&TempChunk{value: []byte{'c'}},
&LoadedChunk{id: &ChunkId{0, 1}},
}
- newChunks := extractNewChunks(chunks)
- if len(newChunks) != 2 {
- t.Error("New chunks should contain 2 slices")
- t.Log("Actual: ", newChunks)
- }
- if len(newChunks[1]) != 2 {
- t.Error("New chunks second slice should contain 2 chunks")
- t.Log("Actual: ", newChunks[0])
- }
- if !reflect.DeepEqual(newChunks[1][0], chunks[2]) {
- t.Error("New chunks do not match")
- t.Log("Expected: ", chunks[2])
- t.Log("Actual: ", newChunks[1][0])
- }
+ newChunks := extractTempChunks(chunks)
+ assertLen(t, 2, newChunks, "New chunks:")
+ assertChunkContent(t, []byte{'a'}, newChunks[0], "First new:")
+ assertChunkContent(t, []byte{'b', 'c'}, newChunks[1], "Second New:")
}
func TestStoreLoadFiles(t *testing.T) {
@@ -150,9 +140,7 @@ func TestStoreLoadFiles(t *testing.T) {
files1 := listFiles(dataDir)
storeFileList(resultFiles, files1)
files2 := loadFileList(resultFiles)
- if len(files1) != 4 {
- t.Errorf("Incorrect number of files: %d, should be %d\n", len(files1), 4)
- }
+ assertLen(t, 4, files1, "Files:")
for i, f := range files1 {
if f != files2[i] {
t.Errorf("Loaded file data %d does not match stored one", i)
@@ -189,22 +177,44 @@ func TestBsdiff(t *testing.T) {
go concatFiles(files, writer)
fingerprints, sketches := hashChunks(oldChunks)
recipe := repo.matchStream(reader, fingerprints)
- newChunks := extractNewChunks(recipe)
- log.Println("Checking new chunks:", len(newChunks[0]))
- for _, chunks := range newChunks {
- for _, c := range chunks {
- id, exists := findSimilarChunk(c, sketches)
- log.Println(id, exists)
- if exists {
- patch := new(bytes.Buffer)
- stored := id.Reader(repo.path)
- new := c.Reader()
- bsdiff.Reader(stored, new, patch)
- log.Println("Patch size:", patch.Len())
- if patch.Len() >= chunkSize/10 {
- t.Errorf("Bsdiff of chunk is too large: %d", patch.Len())
- }
+ newChunks := extractTempChunks(recipe)
+ assertLen(t, 2, newChunks, "New chunks:")
+ for _, c := range newChunks {
+ id, exists := findSimilarChunk(c, sketches)
+ log.Println(id, exists)
+ if exists {
+ patch := new(bytes.Buffer)
+ stored := id.Reader(repo.path)
+ new := c.Reader()
+ bsdiff.Reader(stored, new, patch)
+ log.Println("Patch size:", patch.Len())
+ if patch.Len() >= chunkSize/10 {
+ t.Errorf("Bsdiff of chunk is too large: %d", patch.Len())
}
}
}
}
+
+func assertLen(t *testing.T, expected int, actual interface{}, prefix string) {
+ s := reflect.ValueOf(actual)
+ if s.Len() != expected {
+ t.Error(prefix, "incorrect length, expected:", expected, ", actual:", s.Len())
+ }
+}
+
+func assertSameSlice(t *testing.T, expected []byte, actual []byte, prefix string) {
+ assertLen(t, len(expected), actual, prefix)
+ for i := 0; i < len(expected); i++ {
+ if expected[i] != actual[i] {
+ t.Fatal(prefix, "incorrect value", i, ", expected:", expected[i], ", actual:", actual[i])
+ }
+ }
+}
+
+func assertChunkContent(t *testing.T, expected []byte, c Chunk, prefix string) {
+ buf, err := io.ReadAll(c.Reader())
+ if err != nil {
+ t.Fatal(err)
+ }
+ assertSameSlice(t, expected, buf, prefix+" Chunk content")
+}
diff --git a/sketch.go b/sketch.go
index 9910848..c88f043 100644
--- a/sketch.go
+++ b/sketch.go
@@ -16,7 +16,7 @@ const fBytes = 8
// sfCount: the number of super-features, and fCount: the number of features
// per super-feature
func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) {
- var fSize = chunkSize / (sfCount * fCount)
+ var fSize = FeatureSize(chunkSize, sfCount, fCount)
superfeatures := make([]uint64, 0, sfCount)
features := make([]uint64, 0, fCount*sfCount)
buff := make([]byte, fBytes*fCount)
@@ -49,3 +49,11 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error
}
return superfeatures, nil
}
+
+func SuperFeatureSize(chunkSize int, sfCount int, fCount int) int {
+ return FeatureSize(chunkSize, sfCount, fCount) * sfCount
+}
+
+func FeatureSize(chunkSize int, sfCount int, fCount int) int {
+ return chunkSize / (sfCount * fCount)
+}
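
To make the new helpers concrete, a small worked example (editor's illustration; the values 8192, 4 and 4 are assumptions chosen for readability, not necessarily the repo's actual chunkSize, sketchSfCount and sketchFCount):

package main

import "fmt"

func FeatureSize(chunkSize int, sfCount int, fCount int) int {
	return chunkSize / (sfCount * fCount)
}

func SuperFeatureSize(chunkSize int, sfCount int, fCount int) int {
	return FeatureSize(chunkSize, sfCount, fCount) * sfCount
}

func main() {
	// With 8 KiB chunks and 4 super-features of 4 features each:
	fmt.Println(FeatureSize(8192, 4, 4))      // 512  bytes per feature
	fmt.Println(SuperFeatureSize(8192, 4, 4)) // 2048 bytes, i.e. chunkSize / fCount
	// Under these assumed settings, a temp chunk of at most 2048 bytes would
	// be joined with its predecessor by extractTempChunks.
}
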