-rw-r--r--  TODO.md | 16 +++++++++++-----
-rw-r--r--  repo.go | 17 +++++++++--------
2 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/TODO.md b/TODO.md
--- a/TODO.md
+++ b/TODO.md
@@ -1,7 +1,10 @@
 priority 1
 ----------
 - [x] add deltaEncode chunks function
-  - [x] do not merge consecutive smaller chunks as these could be stored as chunks if no similar chunk is found. Thus it will need to be of `chunkSize` or less. Otherwise it could not be possibly used for deduplication.
+  - [x] do not merge consecutive smaller chunks as these could be stored as
+    chunks if no similar chunk is found. Thus it will need to be of
+    `chunkSize` or less. Otherwise it could not be possibly used for
+    deduplication.
 ```
 for each new chunk:
   find similar in sketchMap
@@ -12,11 +15,11 @@ priority 1
   store in fingerprintMap
   store in sketchMap
 ```
-- [ ] read from repo (Restore function)
+- [x] read from repo (Restore function)
   - [x] store recipe
   - [x] load recipe
   - [x] read chunks in-order into a stream
-  - [ ] read individual files
+- [ ] read individual files
 - [ ] properly store informations to be DNA encoded
 - [ ] tar source to keep files metadata ?
 - [ ] store chunks compressed
@@ -28,11 +31,14 @@ priority 1

 priority 2
 ----------
-- [x] use more the `Reader` API (which is analoguous to the `IOStream` in Java)
+- [ ] use more the `Reader` API (which is analoguous to the `IOStream` in Java)
 - [ ] refactor matchStream as right now it is quite complex
 - [x] better test for `(*Repo).matchStream`
-- [ ] tail packing of PartialChunks (this Struct does not exist yet as it is in fact just `TempChunks` for now)
+- [ ] tail packing of PartialChunks (this Struct does not exist yet as it is in
+  fact just `TempChunks` for now)
 - [ ] option to commit without deltas to save new base chunks
+- [ ] custom binary marshall and unmarshal for chunks
+- [ ] use `loadChunkContent` in `loadChunks`

 réunion 7/09
 ------------
diff --git a/repo.go b/repo.go
--- a/repo.go
+++ b/repo.go
@@ -313,16 +313,18 @@ func (r *Repo) chunkMinLen() int {
 func (r *Repo) hashChunks(chunks <-chan IdentifiedChunk) {
     hasher := rabinkarp64.NewFromPol(r.pol)
     for c := range chunks {
-        r.hashAndStoreChunk(c, hasher)
+        r.hashAndStoreChunk(c.GetId(), c.Reader(), hasher)
     }
 }

-func (r *Repo) hashAndStoreChunk(chunk IdentifiedChunk, hasher hash.Hash64) {
+func (r *Repo) hashAndStoreChunk(id *ChunkId, reader io.Reader, hasher hash.Hash64) {
+    var chunk bytes.Buffer
     hasher.Reset()
-    io.Copy(hasher, chunk.Reader())
+    reader = io.TeeReader(reader, &chunk)
+    io.Copy(hasher, reader)
     fingerprint := hasher.Sum64()
-    sketch, _ := sketch.SketchChunk(chunk.Reader(), r.pol, r.chunkSize, r.sketchWSize, r.sketchSfCount, r.sketchFCount)
-    r.storeChunkId(chunk.GetId(), fingerprint, sketch)
+    sketch, _ := sketch.SketchChunk(&chunk, r.pol, r.chunkSize, r.sketchWSize, r.sketchSfCount, r.sketchFCount)
+    r.storeChunkId(id, fingerprint, sketch)
 }

 func (r *Repo) storeChunkId(id *ChunkId, fingerprint uint64, sketch []uint64) {
@@ -397,10 +399,9 @@ func (r *Repo) encodeTempChunk(temp BufferedChunk, version int, last *uint64) (c
     if chunk.Len() == r.chunkSize {
         id := &ChunkId{Ver: version, Idx: *last}
         *last++
-        ic := NewLoadedChunk(id, temp.Bytes())
         hasher := rabinkarp64.NewFromPol(r.pol)
-        r.hashAndStoreChunk(ic, hasher)
-        r.StoreChunkContent(id, ic.Reader())
+        r.hashAndStoreChunk(id, temp.Reader(), hasher)
+        r.StoreChunkContent(id, temp.Reader())
         log.Println("Add new chunk", id)
         return NewStoredChunk(r, id), false
     }
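For context on the matching loop sketched in TODO.md: each new chunk is looked up in a `sketchMap` (similarity) and a `fingerprintMap` (exact identity). The middle of that pseudocode block is elided between the hunks above, so the branch logic below is an assumption; `ChunkId` mirrors the struct literal visible in repo.go, while the map shapes and `findMatch` helper are hypothetical.

```go
package main

import "fmt"

// ChunkId mirrors the identifier built in repo.go (&ChunkId{Ver: ..., Idx: ...}).
type ChunkId struct {
	Ver int
	Idx uint64
}

// Hypothetical index shapes for the matching loop sketched in TODO.md;
// the real repo may key these differently.
var (
	fingerprintMap = map[uint64]*ChunkId{}   // exact-duplicate lookups
	sketchMap      = map[uint64][]*ChunkId{} // similarity lookups per sketch feature
)

// findMatch returns an exact duplicate if one exists, otherwise a similar
// chunk usable as a delta-encoding base, otherwise nothing.
func findMatch(fingerprint uint64, sketch []uint64) (*ChunkId, bool) {
	if id, ok := fingerprintMap[fingerprint]; ok {
		return id, true
	}
	for _, feature := range sketch {
		if ids := sketchMap[feature]; len(ids) > 0 {
			return ids[0], false
		}
	}
	return nil, false
}

func main() {
	base := &ChunkId{Ver: 1, Idx: 0}
	fingerprintMap[42] = base
	sketchMap[7] = []*ChunkId{base}
	if id, exact := findMatch(99, []uint64{7}); id != nil && !exact {
		fmt.Println("delta-encode against", *id) // similar, not identical
	}
}
```

An exact fingerprint hit means the chunk can be deduplicated outright; a sketch hit only yields a candidate base for delta encoding, which is what the "add deltaEncode chunks function" item above refers to.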
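The repo.go side of this commit changes `hashAndStoreChunk` to take an explicit id and reader instead of an `IdentifiedChunk`, and tees the reader into a `bytes.Buffer` so the chunk is read from its source only once: the same bytes feed the fingerprint hasher and then, buffered, `sketch.SketchChunk`. Here is a minimal, self-contained sketch of that single-pass pattern, with `hash/fnv` standing in for the rabinkarp64 hasher the repo builds from `r.pol`:

```go
package main

import (
	"bytes"
	"fmt"
	"hash/fnv"
	"io"
	"strings"
)

// hashAndBuffer fingerprints everything read from r while keeping a copy,
// so a second pass (sketching, storing) does not need to re-read the source.
func hashAndBuffer(r io.Reader) (uint64, *bytes.Buffer, error) {
	var chunk bytes.Buffer
	hasher := fnv.New64a() // stand-in for rabinkarp64 in this sketch
	// TeeReader writes every byte consumed by io.Copy into chunk as a side effect.
	if _, err := io.Copy(hasher, io.TeeReader(r, &chunk)); err != nil {
		return 0, nil, err
	}
	return hasher.Sum64(), &chunk, nil
}

func main() {
	fingerprint, chunk, err := hashAndBuffer(strings.NewReader("chunk payload"))
	if err != nil {
		panic(err)
	}
	fmt.Printf("fingerprint=%x buffered=%d bytes\n", fingerprint, chunk.Len())
}
```

The trade-off is one in-memory copy of the chunk per call, in exchange for not requiring the source to be re-readable: the previous code called `chunk.Reader()` twice, once for the hash and once for the sketch.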