diff options
author | n-peugnet <n.peugnet@free.fr> | 2021-08-31 16:28:07 +0200 |
---|---|---|
committer | n-peugnet <n.peugnet@free.fr> | 2021-08-31 16:38:34 +0200 |
commit | 504fe3db47c058807b656a8e63bb27c12420f268 (patch) | |
tree | 5fec35a147b3234633d237601cc49627fbedf331 | |
parent | c481eb2b44adf50b62de3b9e3355f64973967d52 (diff) | |
download | dna-backup-504fe3db47c058807b656a8e63bb27c12420f268.tar.gz dna-backup-504fe3db47c058807b656a8e63bb27c12420f268.zip |
join too-small temp chunks with the previous one if possible
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | TODO.md | 3 | ||||
-rw-r--r-- | chunk.go | 8 | ||||
-rw-r--r-- | repo.go | 34 | ||||
-rw-r--r-- | repo_test.go | 74 | ||||
-rw-r--r-- | sketch.go | 10 |
6 files changed, 82 insertions, 49 deletions
@@ -15,7 +15,7 @@ _Classical go_ go build # Test -go test +go test -v # Run ./dna-backup <source-dir> <repository> @@ -1,8 +1,7 @@ priority 1 ---------- -- join non-deduplicated chunks - - choose when and how to - delta encode chunks +- match stream against chunks from itself - read from repo priority 2 @@ -107,3 +107,11 @@ func (c *TempChunk) Reader() ChunkReader { func (c *TempChunk) Len() int { return len(c.value) } + +func (c *TempChunk) AppendFrom(r io.Reader) { + buff, err := io.ReadAll(r) + if err != nil { + println("Chunk: error appending to temp chunk:", err) + } + c.value = append(c.value, buff...) +} @@ -71,7 +71,7 @@ func (r *Repo) Commit(source string) { go concatFiles(files, writer) fingerprints, _ := hashChunks(oldChunks) chunks := r.matchStream(reader, fingerprints) - extractNewChunks(chunks) + extractTempChunks(chunks) // storeChunks(newChunkPath, newChunks) // storeFiles(newFilesPath, files) fmt.Println(files) @@ -327,24 +327,32 @@ func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chun return chunks } -// extractNewChunks extracts new chunks from an array of chunks and -// returns them in an array of consecutive new chunk's array -func extractNewChunks(chunks []Chunk) (ret [][]Chunk) { - var i int - ret = append(ret, nil) +// extractTempChunks extracts temporary chunks from an array of chunks. +// If a chunk is smaller than the size required to calculate a super-feature, +// it is then appended to the previous consecutive temporary chunk if it exists. 
+func extractTempChunks(chunks []Chunk) (ret []Chunk) { + var prev *TempChunk + var curr *TempChunk for _, c := range chunks { - _, isTmp := c.(*TempChunk) + tmp, isTmp := c.(*TempChunk) if !isTmp { - if len(ret[i]) != 0 { - i++ - ret = append(ret, nil) + if prev != nil && curr.Len() <= SuperFeatureSize(chunkSize, sketchSfCount, sketchFCount) { + prev.AppendFrom(curr.Reader()) + } else if curr != nil { + ret = append(ret, curr) } + curr = nil + prev = nil } else { - ret[i] = append(ret[i], c) + prev = curr + curr = tmp + if prev != nil { + ret = append(ret, prev) + } } } - if len(ret[i]) == 0 { - ret = ret[:i] + if curr != nil { + ret = append(ret, curr) } return } diff --git a/repo_test.go b/repo_test.go index cdd3024..134b55c 100644 --- a/repo_test.go +++ b/repo_test.go @@ -127,20 +127,10 @@ func TestExtractNewChunks(t *testing.T) { &TempChunk{value: []byte{'c'}}, &LoadedChunk{id: &ChunkId{0, 1}}, } - newChunks := extractNewChunks(chunks) - if len(newChunks) != 2 { - t.Error("New chunks should contain 2 slices") - t.Log("Actual: ", newChunks) - } - if len(newChunks[1]) != 2 { - t.Error("New chunks second slice should contain 2 chunks") - t.Log("Actual: ", newChunks[0]) - } - if !reflect.DeepEqual(newChunks[1][0], chunks[2]) { - t.Error("New chunks do not match") - t.Log("Expected: ", chunks[2]) - t.Log("Actual: ", newChunks[1][0]) - } + newChunks := extractTempChunks(chunks) + assertLen(t, 2, newChunks, "New chunks:") + assertChunkContent(t, []byte{'a'}, newChunks[0], "First new:") + assertChunkContent(t, []byte{'b', 'c'}, newChunks[1], "Second New:") } func TestStoreLoadFiles(t *testing.T) { @@ -150,9 +140,7 @@ func TestStoreLoadFiles(t *testing.T) { files1 := listFiles(dataDir) storeFileList(resultFiles, files1) files2 := loadFileList(resultFiles) - if len(files1) != 4 { - t.Errorf("Incorrect number of files: %d, should be %d\n", len(files1), 4) - } + assertLen(t, 4, files1, "Files:") for i, f := range files1 { if f != files2[i] { t.Errorf("Loaded file data %d 
does not match stored one", i) @@ -189,22 +177,44 @@ func TestBsdiff(t *testing.T) { go concatFiles(files, writer) fingerprints, sketches := hashChunks(oldChunks) recipe := repo.matchStream(reader, fingerprints) - newChunks := extractNewChunks(recipe) - log.Println("Checking new chunks:", len(newChunks[0])) - for _, chunks := range newChunks { - for _, c := range chunks { - id, exists := findSimilarChunk(c, sketches) - log.Println(id, exists) - if exists { - patch := new(bytes.Buffer) - stored := id.Reader(repo.path) - new := c.Reader() - bsdiff.Reader(stored, new, patch) - log.Println("Patch size:", patch.Len()) - if patch.Len() >= chunkSize/10 { - t.Errorf("Bsdiff of chunk is too large: %d", patch.Len()) - } + newChunks := extractTempChunks(recipe) + assertLen(t, 2, newChunks, "New chunks:") + for _, c := range newChunks { + id, exists := findSimilarChunk(c, sketches) + log.Println(id, exists) + if exists { + patch := new(bytes.Buffer) + stored := id.Reader(repo.path) + new := c.Reader() + bsdiff.Reader(stored, new, patch) + log.Println("Patch size:", patch.Len()) + if patch.Len() >= chunkSize/10 { + t.Errorf("Bsdiff of chunk is too large: %d", patch.Len()) } } } } + +func assertLen(t *testing.T, expected int, actual interface{}, prefix string) { + s := reflect.ValueOf(actual) + if s.Len() != expected { + t.Error(prefix, "incorrect length, expected:", expected, ", actual:", s.Len()) + } +} + +func assertSameSlice(t *testing.T, expected []byte, actual []byte, prefix string) { + assertLen(t, len(expected), actual, prefix) + for i := 0; i < len(expected); i++ { + if expected[i] != actual[i] { + t.Fatal(prefix, "incorrect value", i, ", expected:", expected[i], ", actual:", actual[i]) + } + } +} + +func assertChunkContent(t *testing.T, expected []byte, c Chunk, prefix string) { + buf, err := io.ReadAll(c.Reader()) + if err != nil { + t.Fatal(err) + } + assertSameSlice(t, expected, buf, prefix+" Chunk content") +} @@ -16,7 +16,7 @@ const fBytes = 8 // sfCount: the 
number of super-features, and fCount: the number of feature // per super-feature func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) { - var fSize = chunkSize / (sfCount * fCount) + var fSize = FeatureSize(chunkSize, sfCount, fCount) superfeatures := make([]uint64, 0, sfCount) features := make([]uint64, 0, fCount*sfCount) buff := make([]byte, fBytes*fCount) @@ -49,3 +49,11 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error } return superfeatures, nil } + +func SuperFeatureSize(chunkSize int, sfCount int, fCount int) int { + return FeatureSize(chunkSize, sfCount, fCount) * sfCount +} + +func FeatureSize(chunkSize int, sfCount int, fCount int) int { + return chunkSize / (sfCount * fCount) +} |