diff options
author | n-peugnet <n.peugnet@free.fr> | 2021-09-29 16:31:28 +0200 |
---|---|---|
committer | n-peugnet <n.peugnet@free.fr> | 2021-09-29 16:31:28 +0200 |
commit | c2cc8f8a6fb65488f19a5addf47d83e19aff6f4b (patch) | |
tree | 08300725c0bc0395e2b9c9bb9b511cf7b13a919a | |
parent | 5dad27953b9050f097b53227cfc29e3d3373fd97 (diff) | |
download | dna-backup-c2cc8f8a6fb65488f19a5addf47d83e19aff6f4b.tar.gz dna-backup-c2cc8f8a6fb65488f19a5addf47d83e19aff6f4b.zip |
move old funcs to repo_test and add roundtrip test
-rw-r--r-- | TODO.md | 9 | ||||
-rw-r--r-- | repo.go | 41 | ||||
-rw-r--r-- | repo_test.go | 60 |
3 files changed, 69 insertions, 41 deletions
@@ -101,3 +101,12 @@ waiting for better matches will probably lower the size of the patches. This has been fixed by making multiple passes until no more blocks are added, this way we are assured that the result will be the same on the following run. + +mystical bug number 2 29/09 +--------------------------- + +After modifying only one file of a big source folder, between the first and the +second run of `dna-backup`, around 20 blocks at the beginning of the recipe +have been replaced by newer ones. These should not have been modified. + +However, I could not reproduce it... @@ -38,7 +38,6 @@ import ( "os" "path/filepath" "reflect" - "sync" "github.com/chmduquesne/rollinghash/rabinkarp64" "github.com/n-peugnet/dna-backup/cache" @@ -493,46 +492,6 @@ func (r *Repo) chunkMinLen() int { return sketch.SuperFeatureSize(r.chunkSize, r.sketchSfCount, r.sketchFCount) } -// hashChunks calculates the hashes for a channel of chunks. -// For each chunk, both a fingerprint (hash over the full content) and a sketch -// (resemblance hash based on maximal values of regions) are calculated and -// stored in an hashmap. -func (r *Repo) hashChunks(chunks <-chan IdentifiedChunk) { - for c := range chunks { - r.hashChunk(c.GetId(), c.Reader()) - } -} - -// hashChunk calculates the hashes for a chunk and store them in th repo hashmaps. 
-func (r *Repo) hashChunk(id *ChunkId, reader io.Reader) (fp uint64, sk []uint64) { - var buffSk bytes.Buffer - var buffFp bytes.Buffer - var wg sync.WaitGroup - reader = io.TeeReader(reader, &buffSk) - io.Copy(&buffFp, reader) - wg.Add(2) - go r.makeFingerprint(id, &buffFp, &wg, &fp) - go r.makeSketch(id, &buffSk, &wg, &sk) - wg.Wait() - if _, e := r.fingerprints[fp]; e { - logger.Error(fp, " already exists in fingerprints map") - } - r.fingerprints[fp] = id - r.sketches.Set(sk, id) - return -} - -func (r *Repo) makeFingerprint(id *ChunkId, reader io.Reader, wg *sync.WaitGroup, ret *uint64) { - defer wg.Done() - hasher := rabinkarp64.NewFromPol(r.pol) - io.Copy(hasher, reader) - *ret = hasher.Sum64() -} - -func (r *Repo) makeSketch(id *ChunkId, reader io.Reader, wg *sync.WaitGroup, ret *[]uint64) { - defer wg.Done() - *ret, _ = sketch.SketchChunk(reader, r.pol, r.chunkSize, r.sketchWSize, r.sketchSfCount, r.sketchFCount) -} func contains(s []*ChunkId, id *ChunkId) bool { for _, v := range s { if v == id { diff --git a/repo_test.go b/repo_test.go index 4b5c09b..35b3101 100644 --- a/repo_test.go +++ b/repo_test.go @@ -8,9 +8,12 @@ import ( "os" "path/filepath" "strings" + "sync" "testing" + "github.com/chmduquesne/rollinghash/rabinkarp64" "github.com/n-peugnet/dna-backup/logger" + "github.com/n-peugnet/dna-backup/sketch" "github.com/n-peugnet/dna-backup/testutils" "github.com/n-peugnet/dna-backup/utils" ) @@ -96,6 +99,47 @@ func storeChunks(dest string, chunks <-chan []byte) { } } +// hashChunks calculates the hashes for a channel of chunks. +// For each chunk, both a fingerprint (hash over the full content) and a sketch +// (resemblance hash based on maximal values of regions) are calculated and +// stored in an hashmap. +func (r *Repo) hashChunks(chunks <-chan IdentifiedChunk) { + for c := range chunks { + r.hashChunk(c.GetId(), c.Reader()) + } +} + +// hashChunk calculates the hashes for a chunk and store them in th repo hashmaps. 
+func (r *Repo) hashChunk(id *ChunkId, reader io.Reader) (fp uint64, sk []uint64) { + var buffSk bytes.Buffer + var buffFp bytes.Buffer + var wg sync.WaitGroup + reader = io.TeeReader(reader, &buffSk) + io.Copy(&buffFp, reader) + wg.Add(2) + go r.makeFingerprint(id, &buffFp, &wg, &fp) + go r.makeSketch(id, &buffSk, &wg, &sk) + wg.Wait() + if _, e := r.fingerprints[fp]; e { + logger.Error(fp, " already exists in fingerprints map") + } + r.fingerprints[fp] = id + r.sketches.Set(sk, id) + return +} + +func (r *Repo) makeFingerprint(id *ChunkId, reader io.Reader, wg *sync.WaitGroup, ret *uint64) { + defer wg.Done() + hasher := rabinkarp64.NewFromPol(r.pol) + io.Copy(hasher, reader) + *ret = hasher.Sum64() +} + +func (r *Repo) makeSketch(id *ChunkId, reader io.Reader, wg *sync.WaitGroup, ret *[]uint64) { + defer wg.Done() + *ret, _ = sketch.SketchChunk(reader, r.pol, r.chunkSize, r.sketchWSize, r.sketchSfCount, r.sketchFCount) +} + func TestReadFiles1(t *testing.T) { tmpDir := t.TempDir() repo := NewRepo(tmpDir) @@ -308,6 +352,22 @@ func TestRestoreZlib(t *testing.T) { assertSameTree(t, testutils.AssertSameFile, expected, dest, "Restore") } +func TestRoundtrip(t *testing.T) { + temp := t.TempDir() + dest := t.TempDir() + source := filepath.Join("testdata", "logs") + repo1 := NewRepo(temp) + repo2 := NewRepo(temp) + + repo1.Commit(source) + // Commit a second version, just to see if it does not destroy everything + // TODO: check that the second version is indeed empty + repo1.Commit(source) + repo2.Restore(dest) + + assertSameTree(t, assertCompatibleRepoFile, source, dest, "Commit") +} + func TestHashes(t *testing.T) { dest := t.TempDir() source := filepath.Join("testdata", "repo_8k") |