diff options
author | n-peugnet <n.peugnet@free.fr> | 2021-08-31 12:05:29 +0200 |
---|---|---|
committer | n-peugnet <n.peugnet@free.fr> | 2021-08-31 12:05:29 +0200 |
commit | c481eb2b44adf50b62de3b9e3355f64973967d52 (patch) | |
tree | 34a218c926f6aa6420c8abfcf703262e6148c0ed | |
parent | 36da6832dce67da09d7bcee1a6ab2312e515cb0a (diff) | |
download | dna-backup-c481eb2b44adf50b62de3b9e3355f64973967d52.tar.gz dna-backup-c481eb2b44adf50b62de3b9e3355f64973967d52.zip |
do not fill partial cunks with padding
this way a partial chunk may have less superfeatures than
a complete one
-rw-r--r-- | chunk.go | 28 | ||||
-rw-r--r-- | repo.go | 10 | ||||
-rw-r--r-- | repo_test.go | 10 | ||||
-rw-r--r-- | sketch.go | 39 | ||||
-rw-r--r-- | sketch_test.go | 10 | ||||
-rw-r--r-- | test/data/repo_8k/00000/chunks/000000000000014 | bin | 8192 -> 4710 bytes |
6 files changed, 64 insertions, 33 deletions
@@ -17,6 +17,7 @@ type ChunkReader interface { type Chunk interface { Reader() ChunkReader + Len() int } type StoredChunk interface { @@ -29,11 +30,15 @@ type ChunkId struct { Idx uint64 } +func (i *ChunkId) Path(repo string) string { + return path.Join(repo, fmt.Sprintf(versionFmt, i.Ver), chunksName, fmt.Sprintf(chunkIdFmt, i.Idx)) +} + func (i *ChunkId) Reader(repo string) ChunkReader { - p := path.Join(repo, fmt.Sprintf(versionFmt, i.Ver), chunksName, fmt.Sprintf(chunkIdFmt, i.Idx)) - f, err := os.Open(p) + path := i.Path(repo) + f, err := os.Open(path) if err != nil { - log.Printf("Cannot open chunk %s\n", p) + log.Println("Cannot open chunk: ", path) } return bufio.NewReaderSize(f, chunkSize) } @@ -56,6 +61,10 @@ func (c *LoadedChunk) Reader() ChunkReader { return bytes.NewReader(c.value) } +func (c *LoadedChunk) Len() int { + return len(c.value) +} + func NewChunkFile(repo *Repo, id *ChunkId) *ChunkFile { return &ChunkFile{repo: repo, id: id} } @@ -74,6 +83,15 @@ func (c *ChunkFile) Reader() ChunkReader { return c.id.Reader(c.repo.path) } +func (c *ChunkFile) Len() int { + path := c.id.Path(c.repo.path) + info, err := os.Stat(path) + if err != nil { + log.Println("Chunk: could not stat file:", path) + } + return int(info.Size()) +} + func NewTempChunk(value []byte) *TempChunk { return &TempChunk{value: value} } @@ -85,3 +103,7 @@ type TempChunk struct { func (c *TempChunk) Reader() ChunkReader { return bytes.NewReader(c.value) } + +func (c *TempChunk) Len() int { + return len(c.value) +} @@ -145,7 +145,7 @@ func chunkStream(stream io.Reader, chunks chan<- []byte) { } } if prev != chunkSize { - chunks <- buff + chunks <- buff[:prev] } close(chunks) } @@ -290,9 +290,7 @@ func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chun if len(buff) > chunkSize && len(buff) < chunkSize*2 { size := len(buff) - chunkSize log.Println("Add new partial chunk of size:", size) - tmp := make([]byte, 0, chunkSize) - tmp = append(tmp, buff[:size]...) - chunks = append(chunks, NewTempChunk(tmp[:chunkSize])) + chunks = append(chunks, NewTempChunk(buff[:size])) } log.Printf("Add existing chunk: %d\n", chunkId) chunks = append(chunks, NewChunkFile(r, chunkId)) @@ -321,10 +319,10 @@ func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chun log.Println("Add new chunk") chunks = append(chunks, NewTempChunk(buff[:chunkSize])) log.Println("Add new partial chunk of size:", len(buff)-chunkSize) - chunks = append(chunks, NewTempChunk(buff[chunkSize:chunkSize*2])) + chunks = append(chunks, NewTempChunk(buff[chunkSize:])) } else if len(buff) > 0 { log.Println("Add new partial chunk of size:", len(buff)) - chunks = append(chunks, NewTempChunk(buff[chunkSize:chunkSize*2])) + chunks = append(chunks, NewTempChunk(buff)) } return chunks } diff --git a/repo_test.go b/repo_test.go index cbb54df..cdd3024 100644 --- a/repo_test.go +++ b/repo_test.go @@ -35,16 +35,18 @@ func chunkCompare(t *testing.T, dataDir string, testFiles []string, chunkCount i i := 0 for c := range chunks { - content := buff[i*chunkSize : (i+1)*chunkSize] - if len(c) != chunkSize { - t.Errorf("Chunk %d is not of chunkSize: %d", i, chunkSize) + start := i * chunkSize + end := (i + 1) * chunkSize + if end > offset { + end = offset } + content := buff[start:end] if bytes.Compare(c, content) != 0 { t.Errorf("Chunk %d does not match file content", i) // for i, b := range c { // fmt.Printf("E: %d, A: %d\n", b, content[i]) // } - t.Log("Expected: ", c[:10], "...", c[chunkSize-10:]) + t.Log("Expected: ", c[:10], "...", c[end%chunkSize-10:]) t.Log("Actual:", content) } i++ @@ -18,31 +18,30 @@ const fBytes = 8 func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) { var fSize = chunkSize / (sfCount * fCount) superfeatures := make([]uint64, 0, sfCount) - features := make([]uint64, 0, fCount) + features := make([]uint64, 0, fCount*sfCount) buff := make([]byte, fBytes*fCount) r := chunk.Reader() hasher := rabinkarp64.New() - for sf := 0; sf < sfCount; sf++ { - features = features[:0] - for f := 0; f < fCount; f++ { - hasher.Reset() - n, err := io.CopyN(hasher, r, int64(wSize)) - if err != nil { - log.Println(n, err) - } - max := hasher.Sum64() - for w := 0; w < fSize-wSize; w++ { - b, _ := r.ReadByte() - hasher.Roll(b) - h := hasher.Sum64() - if h > max { - max = h - } + for f := 0; f < chunk.Len()/fSize; f++ { + hasher.Reset() + n, err := io.CopyN(hasher, r, int64(wSize)) + if err != nil { + log.Println(n, err) + } + max := hasher.Sum64() + for w := 0; w < fSize-wSize; w++ { + b, _ := r.ReadByte() + hasher.Roll(b) + h := hasher.Sum64() + if h > max { + max = h } - features = append(features, max) } - for i, f := range features { - binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], f) + features = append(features, max) + } + for sf := 0; sf < len(features)/fCount; sf++ { + for i := 0; i < fCount; i++ { + binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], features[i+sf*fCount]) } hasher.Reset() hasher.Write(buff) diff --git a/sketch_test.go b/sketch_test.go index 86a2c58..074bffd 100644 --- a/sketch_test.go +++ b/sketch_test.go @@ -24,6 +24,16 @@ func TestSketchChunk(t *testing.T) { t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch) } } + if i == 14 { + sketch, err := SketchChunk(c, 32, 3, 4) + if err != nil { + t.Error(err) + } + expected := Sketch{658454504014104} + if !reflect.DeepEqual(sketch, expected) { + t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch) + } + } i++ } } diff --git a/test/data/repo_8k/00000/chunks/000000000000014 b/test/data/repo_8k/00000/chunks/000000000000014 Binary files differindex d616ce9..ab7db22 100644 --- a/test/data/repo_8k/00000/chunks/000000000000014 +++ b/test/data/repo_8k/00000/chunks/000000000000014 |