aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorn-peugnet <n.peugnet@free.fr>2021-08-24 18:31:13 +0200
committern-peugnet <n.peugnet@free.fr>2021-08-24 18:40:05 +0200
commitdf6d5f7e24a290718adf8f068649c3bc61f5eb4d (patch)
treeddc31a133a8e82b0529264962fd75d6515b1ab4e
parentb070eae35c1e7a4996b90208153d01f2be08d588 (diff)
downloaddna-backup-df6d5f7e24a290718adf8f068649c3bc61f5eb4d.tar.gz
dna-backup-df6d5f7e24a290718adf8f068649c3bc61f5eb4d.zip
refactor: extract chunk.og & add Reader getter
-rw-r--r--TODO.md11
-rw-r--r--chunk.go47
-rw-r--r--const.go11
-rw-r--r--repo.go42
-rw-r--r--repo_test.go13
5 files changed, 87 insertions, 37 deletions
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..eb911f3
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,11 @@
+priority 1
+----------
+- join non-deduplicated chunks
+ - choose when and how to
+- detect Similar chunks
+ - implement "N-Transform SuperFeature" hash from Shilane-2012
+ - use the hash for detection
+
+priority 2
+----------
+- use more the `Reader` API (which is analogous to the `IOStream` in Java)
diff --git a/chunk.go b/chunk.go
new file mode 100644
index 0000000..9f09e55
--- /dev/null
+++ b/chunk.go
@@ -0,0 +1,47 @@
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "path"
+)
+
+type ChunkId struct {
+ Ver int
+ Idx uint64
+}
+
+func (i *ChunkId) Reader(repo string) io.Reader {
+ p := path.Join(repo, fmt.Sprintf(versionFmt, i.Ver), chunksName, fmt.Sprintf(chunkIdFmt, i.Idx))
+ f, err := os.Open(p)
+ if err != nil {
+ log.Printf("Cannot open chunk %s\n", p)
+ }
+ return f
+}
+
+type Chunk struct {
+ Id *ChunkId
+ Value []byte
+}
+
+func (c *Chunk) Reader(repo string) (io.Reader, error) {
+ if c.Value != nil {
+ return bytes.NewReader(c.Value), nil
+ }
+ if c.Id != nil {
+ return c.Id.Reader(repo), nil
+ }
+ return nil, &ChunkError{"Uninitialized chunk"}
+}
+
+type ChunkError struct {
+ err string
+}
+
+func (e *ChunkError) Error() string {
+ return fmt.Sprintf("Chunk error: %s", e.err)
+}
diff --git a/const.go b/const.go
new file mode 100644
index 0000000..700d3df
--- /dev/null
+++ b/const.go
@@ -0,0 +1,11 @@
+package main
+
+// Defined as var to prevent from using them as const as I want to keep
+// being able to change them at runtime.
+var (
+ chunkSize = 8 << 10
+ chunksName = "chunks"
+ chunkIdFmt = "%015d"
+ versionFmt = "%05d"
+ filesName = "files"
+)
diff --git a/repo.go b/repo.go
index dd96600..3bb60b9 100644
--- a/repo.go
+++ b/repo.go
@@ -25,7 +25,6 @@ repo/
package main
import (
- "bytes"
"encoding/gob"
"fmt"
"hash"
@@ -39,10 +38,6 @@ import (
"github.com/chmduquesne/rollinghash/rabinkarp64"
)
-var chunkSize = 8 << 10
-var versionFmt = "%05d"
-var chunkIdFmt = "%015d"
-
type Repo struct {
path string
}
@@ -52,16 +47,6 @@ type File struct {
Size int64
}
-type ChunkId struct {
- Ver int
- Idx uint64
-}
-
-type Chunk struct {
- Id ChunkId
- Value []byte
-}
-
func NewRepo(path string) *Repo {
os.MkdirAll(path, 0775)
return &Repo{path}
@@ -71,8 +56,8 @@ func (r *Repo) Commit(source string) {
versions := r.loadVersions()
newVersion := len(versions)
newPath := path.Join(r.path, fmt.Sprintf(versionFmt, newVersion))
- newChunkPath := path.Join(newPath, "chunks")
- newFilesPath := path.Join(newPath, "files")
+ newChunkPath := path.Join(newPath, chunksName)
+ newFilesPath := path.Join(newPath, filesName)
os.Mkdir(newPath, 0775)
os.Mkdir(newChunkPath, 0775)
newChunks := make(chan []byte, 16)
@@ -188,7 +173,7 @@ func storeChunks(dest string, chunks <-chan []byte) {
func loadChunks(versions []string, chunks chan<- Chunk) {
for i, v := range versions {
- p := path.Join(v, "chunks")
+ p := path.Join(v, chunksName)
entries, err := os.ReadDir(p)
if err != nil {
log.Printf("Error reading version '%05d' in '%s' chunks: %s", i, v, err)
@@ -203,7 +188,7 @@ func loadChunks(versions []string, chunks chan<- Chunk) {
log.Printf("Error reading chunk '%s': %s", f, err.Error())
}
c := Chunk{
- Id: ChunkId{
+ Id: &ChunkId{
Ver: i,
Idx: uint64(j),
},
@@ -222,15 +207,15 @@ func hashChunks(chunks <-chan Chunk) map[uint64]ChunkId {
hasher.Reset()
hasher.Write(c.Value)
h := hasher.Sum64()
- hashes[h] = c.Id
+ hashes[h] = *c.Id
}
return hashes
}
-func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io.Reader {
+func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []Chunk {
hasher := rabinkarp64.New()
hasher.Write(<-chunks)
- recipe := make([]io.Reader, 0)
+ recipe := make([]Chunk, 0)
var i uint64
var offset, prefill, postfill int
@@ -258,10 +243,10 @@ func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io
hasher.Roll(c[postfill])
}
if len(buff) > 0 {
- recipe = append(recipe, bytes.NewReader(buff))
+ recipe = append(recipe, Chunk{Value: buff})
}
if exists {
- recipe = append(recipe, chunkId2Reader(chunkId, r.path))
+ recipe = append(recipe, Chunk{Id: &chunkId})
}
offset %= chunkSize
i++
@@ -269,15 +254,6 @@ func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io
return recipe
}
-func chunkId2Reader(c ChunkId, repo string) io.Reader {
- p := path.Join(repo, fmt.Sprintf(versionFmt, c.Ver), "chunks", fmt.Sprintf(chunkIdFmt, c.Idx))
- f, err := os.Open(p)
- if err != nil {
- log.Printf("Cannot open chunk %s\n", p)
- }
- return f
-}
-
func writeFile(filePath string, object interface{}) error {
file, err := os.Create(filePath)
if err == nil {
diff --git a/repo_test.go b/repo_test.go
index bde2a63..d25ab8a 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -98,7 +98,7 @@ func TestLoadChunks(t *testing.T) {
resultDir := prepareResult()
dataDir := path.Join("test", "data")
resultVersion := path.Join(resultDir, "00000")
- resultChunks := path.Join(resultVersion, "chunks")
+ resultChunks := path.Join(resultVersion, chunksName)
os.MkdirAll(resultChunks, 0775)
chunks1 := make(chan []byte, 16)
chunks2 := make(chan []byte, 16)
@@ -125,7 +125,7 @@ func TestLoadChunks(t *testing.T) {
func TestStoreLoadFiles(t *testing.T) {
resultDir := prepareResult()
dataDir := path.Join("test", "data")
- resultFiles := path.Join(resultDir, "files")
+ resultFiles := path.Join(resultDir, filesName)
files1 := listFiles(dataDir)
storeFiles(resultFiles, files1)
files2 := loadFiles(resultFiles)
@@ -143,7 +143,7 @@ func TestBsdiff(t *testing.T) {
dataDir := path.Join("test", "data")
addedFile := path.Join(dataDir, "logs.2", "slogTest.log")
resultVersion := path.Join(resultDir, "00000")
- resultChunks := path.Join(resultVersion, "chunks")
+ resultChunks := path.Join(resultVersion, chunksName)
os.MkdirAll(resultChunks, 0775)
chunks := make(chan []byte, 16)
files := listFiles(dataDir)
@@ -163,7 +163,12 @@ func TestBsdiff(t *testing.T) {
hashes := hashChunks(oldChunks)
recipe := repo.matchChunks(newChunks, hashes)
buff := new(bytes.Buffer)
- bsdiff.Reader(recipe[2], recipe[0], buff)
+ r2, _ := recipe[2].Reader(repo.path)
+ r0, _ := recipe[0].Reader(repo.path)
+ bsdiff.Reader(r2, r0, buff)
+ if len(buff.Bytes()) < 500 {
+ t.Errorf("Bsdiff of chunk is too small: %d", len(buff.Bytes()))
+ }
if len(buff.Bytes()) >= chunkSize {
t.Errorf("Bsdiff of chunk is too large: %d", len(buff.Bytes()))
}