author | n-peugnet <n.peugnet@free.fr> | 2021-08-24 18:31:13 +0200
---|---|---
committer | n-peugnet <n.peugnet@free.fr> | 2021-08-24 18:40:05 +0200
commit | df6d5f7e24a290718adf8f068649c3bc61f5eb4d (patch) |
tree | ddc31a133a8e82b0529264962fd75d6515b1ab4e |
parent | b070eae35c1e7a4996b90208153d01f2be08d588 (diff) |
download | dna-backup-df6d5f7e24a290718adf8f068649c3bc61f5eb4d.tar.gz, dna-backup-df6d5f7e24a290718adf8f068649c3bc61f5eb4d.zip |
refactor: extract chunk.go & add Reader getter
-rw-r--r-- | TODO.md      | 11
-rw-r--r-- | chunk.go     | 47
-rw-r--r-- | const.go     | 11
-rw-r--r-- | repo.go      | 42
-rw-r--r-- | repo_test.go | 13
5 files changed, 87 insertions, 37 deletions
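For readers skimming the diff below: the core of this change is that `Chunk` now exposes a `Reader(repo string) (io.Reader, error)` getter, which returns an in-memory reader when `Value` is set and opens the stored chunk file via its `ChunkId` otherwise. A minimal usage sketch within the same package (the `printChunk` helper and its use of `os.Stdout` are illustrative assumptions, not code from this commit):

```go
package main

import (
	"io"
	"log"
	"os"
)

// printChunk resolves a Chunk to an io.Reader through the new getter
// and copies its bytes to stdout. It works both for in-memory chunks
// (Value set) and for stored chunks (only Id set).
func printChunk(c Chunk, repoPath string) {
	r, err := c.Reader(repoPath)
	if err != nil {
		log.Fatal(err) // neither Value nor Id was initialized
	}
	if _, err := io.Copy(os.Stdout, r); err != nil {
		log.Fatal(err)
	}
}
```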
diff --git a/TODO.md b/TODO.md
new file mode 100644
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,11 @@
+priority 1
+----------
+- join non-deduplicated chunks
+    - choose when and how to
+- detect Similar chunks
+    - implement "N-Transform SuperFeature" hash from Shilane-2012
+    - use the hash for detection
+
+priority 2
+----------
+- make more use of the `Reader` API (which is analogous to `IOStream` in Java)
diff --git a/chunk.go b/chunk.go
new file mode 100644
index 0000000..9f09e55
--- /dev/null
+++ b/chunk.go
@@ -0,0 +1,47 @@
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"path"
+)
+
+type ChunkId struct {
+	Ver int
+	Idx uint64
+}
+
+func (i *ChunkId) Reader(repo string) io.Reader {
+	p := path.Join(repo, fmt.Sprintf(versionFmt, i.Ver), chunksName, fmt.Sprintf(chunkIdFmt, i.Idx))
+	f, err := os.Open(p)
+	if err != nil {
+		log.Printf("Cannot open chunk %s\n", p)
+	}
+	return f
+}
+
+type Chunk struct {
+	Id    *ChunkId
+	Value []byte
+}
+
+func (c *Chunk) Reader(repo string) (io.Reader, error) {
+	if c.Value != nil {
+		return bytes.NewReader(c.Value), nil
+	}
+	if c.Id != nil {
+		return c.Id.Reader(repo), nil
+	}
+	return nil, &ChunkError{"Uninitialized chunk"}
+}
+
+type ChunkError struct {
+	err string
+}
+
+func (e *ChunkError) Error() string {
+	return fmt.Sprintf("Chunk error: %s", e.err)
+}
diff --git a/const.go b/const.go
new file mode 100644
index 0000000..700d3df
--- /dev/null
+++ b/const.go
@@ -0,0 +1,11 @@
+package main
+
+// Defined as var rather than const because I want to keep
+// being able to change them at runtime.
+var (
+	chunkSize  = 8 << 10
+	chunksName = "chunks"
+	chunkIdFmt = "%015d"
+	versionFmt = "%05d"
+	filesName  = "files"
+)
diff --git a/repo.go b/repo.go
--- a/repo.go
+++ b/repo.go
@@ -25,7 +25,6 @@ repo/
 package main
 
 import (
-	"bytes"
 	"encoding/gob"
 	"fmt"
 	"hash"
@@ -39,10 +38,6 @@ import (
 	"github.com/chmduquesne/rollinghash/rabinkarp64"
 )
 
-var chunkSize = 8 << 10
-var versionFmt = "%05d"
-var chunkIdFmt = "%015d"
-
 type Repo struct {
 	path string
 }
@@ -52,16 +47,6 @@ type File struct {
 	Size int64
 }
 
-type ChunkId struct {
-	Ver int
-	Idx uint64
-}
-
-type Chunk struct {
-	Id    ChunkId
-	Value []byte
-}
-
 func NewRepo(path string) *Repo {
 	os.MkdirAll(path, 0775)
 	return &Repo{path}
@@ -71,8 +56,8 @@ func (r *Repo) Commit(source string) {
 	versions := r.loadVersions()
 	newVersion := len(versions)
 	newPath := path.Join(r.path, fmt.Sprintf(versionFmt, newVersion))
-	newChunkPath := path.Join(newPath, "chunks")
-	newFilesPath := path.Join(newPath, "files")
+	newChunkPath := path.Join(newPath, chunksName)
+	newFilesPath := path.Join(newPath, filesName)
 	os.Mkdir(newPath, 0775)
 	os.Mkdir(newChunkPath, 0775)
 	newChunks := make(chan []byte, 16)
@@ -188,7 +173,7 @@ func storeChunks(dest string, chunks <-chan []byte) {
 
 func loadChunks(versions []string, chunks chan<- Chunk) {
 	for i, v := range versions {
-		p := path.Join(v, "chunks")
+		p := path.Join(v, chunksName)
 		entries, err := os.ReadDir(p)
 		if err != nil {
 			log.Printf("Error reading version '%05d' in '%s' chunks: %s", i, v, err)
@@ -203,7 +188,7 @@ func loadChunks(versions []string, chunks chan<- Chunk) {
 				log.Printf("Error reading chunk '%s': %s", f, err.Error())
 			}
 			c := Chunk{
-				Id: ChunkId{
+				Id: &ChunkId{
 					Ver: i,
 					Idx: uint64(j),
 				},
@@ -222,15 +207,15 @@ func hashChunks(chunks <-chan Chunk) map[uint64]ChunkId {
 		hasher.Reset()
 		hasher.Write(c.Value)
 		h := hasher.Sum64()
-		hashes[h] = c.Id
+		hashes[h] = *c.Id
 	}
 	return hashes
 }
 
-func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io.Reader {
+func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []Chunk {
 	hasher := rabinkarp64.New()
 	hasher.Write(<-chunks)
-	recipe := make([]io.Reader, 0)
+	recipe := make([]Chunk, 0)
 
 	var i uint64
 	var offset, prefill, postfill int
@@ -258,10 +243,10 @@ func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io
 			hasher.Roll(c[postfill])
 		}
 		if len(buff) > 0 {
-			recipe = append(recipe, bytes.NewReader(buff))
+			recipe = append(recipe, Chunk{Value: buff})
 		}
 		if exists {
-			recipe = append(recipe, chunkId2Reader(chunkId, r.path))
+			recipe = append(recipe, Chunk{Id: &chunkId})
 		}
 		offset %= chunkSize
 		i++
@@ -269,15 +254,6 @@ func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io
 	return recipe
 }
 
-func chunkId2Reader(c ChunkId, repo string) io.Reader {
-	p := path.Join(repo, fmt.Sprintf(versionFmt, c.Ver), "chunks", fmt.Sprintf(chunkIdFmt, c.Idx))
-	f, err := os.Open(p)
-	if err != nil {
-		log.Printf("Cannot open chunk %s\n", p)
-	}
-	return f
-}
-
 func writeFile(filePath string, object interface{}) error {
 	file, err := os.Create(filePath)
 	if err == nil {
diff --git a/repo_test.go b/repo_test.go
index bde2a63..d25ab8a 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -98,7 +98,7 @@ func TestLoadChunks(t *testing.T) {
 	resultDir := prepareResult()
 	dataDir := path.Join("test", "data")
 	resultVersion := path.Join(resultDir, "00000")
-	resultChunks := path.Join(resultVersion, "chunks")
+	resultChunks := path.Join(resultVersion, chunksName)
 	os.MkdirAll(resultChunks, 0775)
 	chunks1 := make(chan []byte, 16)
 	chunks2 := make(chan []byte, 16)
@@ -125,7 +125,7 @@ func TestLoadChunks(t *testing.T) {
 func TestStoreLoadFiles(t *testing.T) {
 	resultDir := prepareResult()
 	dataDir := path.Join("test", "data")
-	resultFiles := path.Join(resultDir, "files")
+	resultFiles := path.Join(resultDir, filesName)
 	files1 := listFiles(dataDir)
 	storeFiles(resultFiles, files1)
 	files2 := loadFiles(resultFiles)
@@ -143,7 +143,7 @@ func TestBsdiff(t *testing.T) {
 	dataDir := path.Join("test", "data")
 	addedFile := path.Join(dataDir, "logs.2", "slogTest.log")
 	resultVersion := path.Join(resultDir, "00000")
-	resultChunks := path.Join(resultVersion, "chunks")
+	resultChunks := path.Join(resultVersion, chunksName)
 	os.MkdirAll(resultChunks, 0775)
 	chunks := make(chan []byte, 16)
 	files := listFiles(dataDir)
@@ -163,7 +163,12 @@ func TestBsdiff(t *testing.T) {
 	hashes := hashChunks(oldChunks)
 	recipe := repo.matchChunks(newChunks, hashes)
 	buff := new(bytes.Buffer)
-	bsdiff.Reader(recipe[2], recipe[0], buff)
+	r2, _ := recipe[2].Reader(repo.path)
+	r0, _ := recipe[0].Reader(repo.path)
+	bsdiff.Reader(r2, r0, buff)
+	if len(buff.Bytes()) < 500 {
+		t.Errorf("Bsdiff of chunk is too small: %d", len(buff.Bytes()))
+	}
 	if len(buff.Bytes()) >= chunkSize {
 		t.Errorf("Bsdiff of chunk is too large: %d", len(buff.Bytes()))
 	}
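Since `matchChunks` now returns `[]Chunk` instead of `[]io.Reader`, the caller decides when each chunk is resolved. As a rough sketch of how such a recipe could be turned back into a single stream, assuming the same package (the `assembleRecipe` helper is hypothetical and not part of the repository):

```go
package main

import "io"

// assembleRecipe resolves every Chunk of a recipe through its Reader
// getter and concatenates the results into one continuous stream.
func assembleRecipe(recipe []Chunk, repoPath string) (io.Reader, error) {
	readers := make([]io.Reader, 0, len(recipe))
	for _, c := range recipe {
		r, err := c.Reader(repoPath) // in-memory Value or stored chunk by Id
		if err != nil {
			return nil, err
		}
		readers = append(readers, r)
	}
	return io.MultiReader(readers...), nil
}
```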