author     n-peugnet <n.peugnet@free.fr>  2021-09-29 16:31:28 +0200
committer  n-peugnet <n.peugnet@free.fr>  2021-09-29 16:31:28 +0200
commit     c2cc8f8a6fb65488f19a5addf47d83e19aff6f4b (patch)
tree       08300725c0bc0395e2b9c9bb9b511cf7b13a919a
parent     5dad27953b9050f097b53227cfc29e3d3373fd97 (diff)
download   dna-backup-c2cc8f8a6fb65488f19a5addf47d83e19aff6f4b.tar.gz
           dna-backup-c2cc8f8a6fb65488f19a5addf47d83e19aff6f4b.zip
move old funcs to repo_test and add roundtrip test
 TODO.md      |  9 +
 repo.go      | 41 -
 repo_test.go | 60 +
 3 files changed, 69 insertions(+), 41 deletions(-)
diff --git a/TODO.md b/TODO.md
index 6ace07e..96d8f1d 100644
--- a/TODO.md
+++ b/TODO.md
@@ -101,3 +101,12 @@ waiting for better matches will probably lower the size of the patches.
This has been fixed by making multiple passes until no more blocks are added,
this way we are assured that the result will be the same on the following run.
+
+mystical bug number 2 29/09
+---------------------------
+
+After modifying only one file of a big source folder, between the first and the
+second run of `dna-backup`, around 20 blocks at the beginning of the recipe
+have been replaced by newer ones. These blocks should not have been modified.
+
+However, I could not reproduce it...
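
To pin down the expected behaviour described in this note, a regression test could commit the same unchanged source twice and verify that the existing recipe entries stay untouched. A rough sketch only, assuming a hypothetical loadRecipe helper that reads the stored recipe back as a slice (no such helper appears in this commit):

// Sketch only: loadRecipe is a hypothetical accessor, not part of this commit.
func TestRecipeStableOnIdenticalCommits(t *testing.T) {
    temp := t.TempDir()
    source := filepath.Join("testdata", "logs")
    repo := NewRepo(temp)

    repo.Commit(source)
    first := loadRecipe(repo) // hypothetical: read back the stored recipe

    // Nothing changed on disk, so the second commit should leave the
    // existing recipe entries exactly as they were.
    repo.Commit(source)
    second := loadRecipe(repo)

    if len(second) < len(first) || !reflect.DeepEqual(first, second[:len(first)]) {
        t.Error("recipe prefix changed between two identical commits")
    }
}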
diff --git a/repo.go b/repo.go
index b9407fc..e633241 100644
--- a/repo.go
+++ b/repo.go
@@ -38,7 +38,6 @@ import (
"os"
"path/filepath"
"reflect"
- "sync"
"github.com/chmduquesne/rollinghash/rabinkarp64"
"github.com/n-peugnet/dna-backup/cache"
@@ -493,46 +492,6 @@ func (r *Repo) chunkMinLen() int {
return sketch.SuperFeatureSize(r.chunkSize, r.sketchSfCount, r.sketchFCount)
}
-// hashChunks calculates the hashes for a channel of chunks.
-// For each chunk, both a fingerprint (hash over the full content) and a sketch
-// (resemblance hash based on maximal values of regions) are calculated and
-// stored in an hashmap.
-func (r *Repo) hashChunks(chunks <-chan IdentifiedChunk) {
-    for c := range chunks {
-        r.hashChunk(c.GetId(), c.Reader())
-    }
-}
-
-// hashChunk calculates the hashes for a chunk and store them in th repo hashmaps.
-func (r *Repo) hashChunk(id *ChunkId, reader io.Reader) (fp uint64, sk []uint64) {
-    var buffSk bytes.Buffer
-    var buffFp bytes.Buffer
-    var wg sync.WaitGroup
-    reader = io.TeeReader(reader, &buffSk)
-    io.Copy(&buffFp, reader)
-    wg.Add(2)
-    go r.makeFingerprint(id, &buffFp, &wg, &fp)
-    go r.makeSketch(id, &buffSk, &wg, &sk)
-    wg.Wait()
-    if _, e := r.fingerprints[fp]; e {
-        logger.Error(fp, " already exists in fingerprints map")
-    }
-    r.fingerprints[fp] = id
-    r.sketches.Set(sk, id)
-    return
-}
-
-func (r *Repo) makeFingerprint(id *ChunkId, reader io.Reader, wg *sync.WaitGroup, ret *uint64) {
-    defer wg.Done()
-    hasher := rabinkarp64.NewFromPol(r.pol)
-    io.Copy(hasher, reader)
-    *ret = hasher.Sum64()
-}
-
-func (r *Repo) makeSketch(id *ChunkId, reader io.Reader, wg *sync.WaitGroup, ret *[]uint64) {
-    defer wg.Done()
-    *ret, _ = sketch.SketchChunk(reader, r.pol, r.chunkSize, r.sketchWSize, r.sketchSfCount, r.sketchFCount)
-}
func contains(s []*ChunkId, id *ChunkId) bool {
for _, v := range s {
if v == id {
diff --git a/repo_test.go b/repo_test.go
index 4b5c09b..35b3101 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -8,9 +8,12 @@ import (
"os"
"path/filepath"
"strings"
+ "sync"
"testing"
+ "github.com/chmduquesne/rollinghash/rabinkarp64"
"github.com/n-peugnet/dna-backup/logger"
+ "github.com/n-peugnet/dna-backup/sketch"
"github.com/n-peugnet/dna-backup/testutils"
"github.com/n-peugnet/dna-backup/utils"
)
@@ -96,6 +99,47 @@ func storeChunks(dest string, chunks <-chan []byte) {
}
}
+// hashChunks calculates the hashes for a channel of chunks.
+// For each chunk, both a fingerprint (hash over the full content) and a sketch
+// (resemblance hash based on maximal values of regions) are calculated and
+// stored in a hashmap.
+func (r *Repo) hashChunks(chunks <-chan IdentifiedChunk) {
+    for c := range chunks {
+        r.hashChunk(c.GetId(), c.Reader())
+    }
+}
+
+// hashChunk calculates the hashes for a chunk and stores them in the repo hashmaps.
+func (r *Repo) hashChunk(id *ChunkId, reader io.Reader) (fp uint64, sk []uint64) {
+    var buffSk bytes.Buffer
+    var buffFp bytes.Buffer
+    var wg sync.WaitGroup
+    reader = io.TeeReader(reader, &buffSk)
+    io.Copy(&buffFp, reader)
+    wg.Add(2)
+    go r.makeFingerprint(id, &buffFp, &wg, &fp)
+    go r.makeSketch(id, &buffSk, &wg, &sk)
+    wg.Wait()
+    if _, e := r.fingerprints[fp]; e {
+        logger.Error(fp, " already exists in fingerprints map")
+    }
+    r.fingerprints[fp] = id
+    r.sketches.Set(sk, id)
+    return
+}
+
+func (r *Repo) makeFingerprint(id *ChunkId, reader io.Reader, wg *sync.WaitGroup, ret *uint64) {
+    defer wg.Done()
+    hasher := rabinkarp64.NewFromPol(r.pol)
+    io.Copy(hasher, reader)
+    *ret = hasher.Sum64()
+}
+
+func (r *Repo) makeSketch(id *ChunkId, reader io.Reader, wg *sync.WaitGroup, ret *[]uint64) {
+    defer wg.Done()
+    *ret, _ = sketch.SketchChunk(reader, r.pol, r.chunkSize, r.sketchWSize, r.sketchSfCount, r.sketchFCount)
+}
+
func TestReadFiles1(t *testing.T) {
tmpDir := t.TempDir()
repo := NewRepo(tmpDir)
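
For reference, the moved hashChunk reads each chunk once but hashes it twice: io.TeeReader mirrors the stream into a second buffer, and the fingerprint and sketch are then computed concurrently under a sync.WaitGroup. Below is a minimal, self-contained sketch of that same idiom, with stdlib hashes standing in for rabinkarp64 and sketch.SketchChunk:

package main

import (
    "bytes"
    "fmt"
    "hash/crc64"
    "hash/fnv"
    "io"
    "strings"
    "sync"
)

// teeHash mirrors hashChunk's shape: one reader, two buffers, two hashes
// computed concurrently. crc64 and fnv are placeholders for the real
// fingerprint (rabinkarp64) and sketch computations.
func teeHash(reader io.Reader) (fp uint64, sk uint64) {
    var buffFp, buffSk bytes.Buffer
    var wg sync.WaitGroup
    reader = io.TeeReader(reader, &buffSk) // every read also fills buffSk
    io.Copy(&buffFp, reader)               // drain once, filling both buffers
    wg.Add(2)
    go func() {
        defer wg.Done()
        h := crc64.New(crc64.MakeTable(crc64.ISO))
        io.Copy(h, &buffFp)
        fp = h.Sum64()
    }()
    go func() {
        defer wg.Done()
        h := fnv.New64a()
        io.Copy(h, &buffSk)
        sk = h.Sum64()
    }()
    wg.Wait()
    return
}

func main() {
    fp, sk := teeHash(strings.NewReader("some chunk content"))
    fmt.Printf("fingerprint=%x sketch=%x\n", fp, sk)
}

In the real code the second value is the sketch (a slice of super-features rather than a single integer), which is why hashChunk returns []uint64 and stores it via r.sketches.Set.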
@@ -308,6 +352,22 @@ func TestRestoreZlib(t *testing.T) {
assertSameTree(t, testutils.AssertSameFile, expected, dest, "Restore")
}
+func TestRoundtrip(t *testing.T) {
+    temp := t.TempDir()
+    dest := t.TempDir()
+    source := filepath.Join("testdata", "logs")
+    repo1 := NewRepo(temp)
+    repo2 := NewRepo(temp)
+
+    repo1.Commit(source)
+    // Commit a second version, just to see if it does not destroy everything
+    // TODO: check that the second version is indeed empty
+    repo1.Commit(source)
+    repo2.Restore(dest)
+
+    assertSameTree(t, assertCompatibleRepoFile, source, dest, "Commit")
+}
+
func TestHashes(t *testing.T) {
dest := t.TempDir()
source := filepath.Join("testdata", "repo_8k")