aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--TODO.md16
-rw-r--r--repo.go17
2 files changed, 20 insertions, 13 deletions
diff --git a/TODO.md b/TODO.md
index 9d2c75f..084dfd6 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,7 +1,10 @@
priority 1
----------
- [x] add deltaEncode chunks function
- - [x] do not merge consecutive smaller chunks as these could be stored as chunks if no similar chunk is found. Thus it will need to be of `chunkSize` or less. Otherwise it could not be possibly used for deduplication.
+ - [x] do not merge consecutive smaller chunks as these could be stored as
+ chunks if no similar chunk is found. Thus it will need to be of
+ `chunkSize` or less. Otherwise it could not be possibly used for
+ deduplication.
```
for each new chunk:
find similar in sketchMap
@@ -12,11 +15,11 @@ priority 1
store in fingerprintMap
store in sketchMap
```
-- [ ] read from repo (Restore function)
+- [x] read from repo (Restore function)
- [x] store recipe
- [x] load recipe
- [x] read chunks in-order into a stream
- - [ ] read individual files
+- [ ] read individual files
- [ ] properly store information to be DNA encoded
- [ ] tar source to keep files metadata ?
- [ ] store chunks compressed
@@ -28,11 +31,14 @@ priority 1
priority 2
----------
-- [x] use more the `Reader` API (which is analogous to the `IOStream` in Java)
+- [ ] use more the `Reader` API (which is analogous to the `IOStream` in Java)
- [ ] refactor matchStream as right now it is quite complex
- [x] better test for `(*Repo).matchStream`
-- [ ] tail packing of PartialChunks (this Struct does not exist yet as it is in fact just `TempChunks` for now)
+- [ ] tail packing of PartialChunks (this Struct does not exist yet as it is in
+ fact just `TempChunks` for now)
- [ ] option to commit without deltas to save new base chunks
+- [ ] custom binary marshal and unmarshal for chunks
+- [ ] use `loadChunkContent` in `loadChunks`
réunion 7/09
------------
diff --git a/repo.go b/repo.go
index ce8d890..21faa0a 100644
--- a/repo.go
+++ b/repo.go
@@ -313,16 +313,18 @@ func (r *Repo) chunkMinLen() int {
func (r *Repo) hashChunks(chunks <-chan IdentifiedChunk) {
hasher := rabinkarp64.NewFromPol(r.pol)
for c := range chunks {
- r.hashAndStoreChunk(c, hasher)
+ r.hashAndStoreChunk(c.GetId(), c.Reader(), hasher)
}
}
-func (r *Repo) hashAndStoreChunk(chunk IdentifiedChunk, hasher hash.Hash64) {
+func (r *Repo) hashAndStoreChunk(id *ChunkId, reader io.Reader, hasher hash.Hash64) {
+ var chunk bytes.Buffer
hasher.Reset()
- io.Copy(hasher, chunk.Reader())
+ reader = io.TeeReader(reader, &chunk)
+ io.Copy(hasher, reader)
fingerprint := hasher.Sum64()
- sketch, _ := sketch.SketchChunk(chunk.Reader(), r.pol, r.chunkSize, r.sketchWSize, r.sketchSfCount, r.sketchFCount)
- r.storeChunkId(chunk.GetId(), fingerprint, sketch)
+ sketch, _ := sketch.SketchChunk(&chunk, r.pol, r.chunkSize, r.sketchWSize, r.sketchSfCount, r.sketchFCount)
+ r.storeChunkId(id, fingerprint, sketch)
}
func (r *Repo) storeChunkId(id *ChunkId, fingerprint uint64, sketch []uint64) {
@@ -397,10 +399,9 @@ func (r *Repo) encodeTempChunk(temp BufferedChunk, version int, last *uint64) (c
if chunk.Len() == r.chunkSize {
id := &ChunkId{Ver: version, Idx: *last}
*last++
- ic := NewLoadedChunk(id, temp.Bytes())
hasher := rabinkarp64.NewFromPol(r.pol)
- r.hashAndStoreChunk(ic, hasher)
- r.StoreChunkContent(id, ic.Reader())
+ r.hashAndStoreChunk(id, temp.Reader(), hasher)
+ r.StoreChunkContent(id, temp.Reader())
log.Println("Add new chunk", id)
return NewStoredChunk(r, id), false
}