about summary refs log tree commit diff
diff options
context:
space:
mode:
author	n-peugnet <n.peugnet@free.fr>	2021-08-23 21:12:55 +0200
committer	n-peugnet <n.peugnet@free.fr>	2021-08-23 21:12:55 +0200
commit	b070eae35c1e7a4996b90208153d01f2be08d588 (patch)
tree	67069c772a376fcd7da199c6bb821cc7261cd523
parent	a67dfdd993dee93950c4b60e99ab6ca92d842072 (diff)
download	dna-backup-b070eae35c1e7a4996b90208153d01f2be08d588.tar.gz
	dna-backup-b070eae35c1e7a4996b90208153d01f2be08d588.zip
try bsdiff and rework some of the API
-rw-r--r--	go.mod	5
-rw-r--r--	go.sum	4
-rw-r--r--	main.go	5
-rw-r--r--	repo.go	89
-rw-r--r--	repo_test.go	56
5 files changed, 116 insertions, 43 deletions
diff --git a/go.mod b/go.mod
index 26bdbbc..59fed50 100644
--- a/go.mod
+++ b/go.mod
@@ -2,4 +2,7 @@ module github.com/n-peugnet/dna-backup
go 1.16
-require github.com/chmduquesne/rollinghash v4.0.0+incompatible
+require (
+ github.com/chmduquesne/rollinghash v4.0.0+incompatible
+ github.com/gabstv/go-bsdiff v1.0.5
+)
diff --git a/go.sum b/go.sum
index c9d9a48..24e690c 100644
--- a/go.sum
+++ b/go.sum
@@ -1,2 +1,6 @@
github.com/chmduquesne/rollinghash v4.0.0+incompatible h1:hnREQO+DXjqIw3rUTzWN7/+Dpw+N5Um8zpKV0JOEgbo=
github.com/chmduquesne/rollinghash v4.0.0+incompatible/go.mod h1:Uc2I36RRfTAf7Dge82bi3RU0OQUmXT9iweIcPqvr8A0=
+github.com/dsnet/compress v0.0.0-20171208185109-cc9eb1d7ad76 h1:eX+pdPPlD279OWgdx7f6KqIRSONuK7egk+jDx7OM3Ac=
+github.com/dsnet/compress v0.0.0-20171208185109-cc9eb1d7ad76/go.mod h1:KjxHHirfLaw19iGT70HvVjHQsL1vq1SRQB4yOsAfy2s=
+github.com/gabstv/go-bsdiff v1.0.5 h1:g29MC/38Eaig+iAobW10/CiFvPtin8U3Jj4yNLcNG9k=
+github.com/gabstv/go-bsdiff v1.0.5/go.mod h1:/Zz6GK+/f/TMylRtVaW3uwZlb0FZITILfA0q12XKGwg=
diff --git a/main.go b/main.go
index f70faa4..691ff25 100644
--- a/main.go
+++ b/main.go
@@ -14,7 +14,6 @@ func main() {
source := os.Args[1]
dest := os.Args[2]
-
- os.MkdirAll(dest, 0775)
- Commit(source, dest)
+ repo := NewRepo(dest)
+ repo.Commit(source)
}
diff --git a/repo.go b/repo.go
index 19caf0d..dd96600 100644
--- a/repo.go
+++ b/repo.go
@@ -25,6 +25,7 @@ repo/
package main
import (
+ "bytes"
"encoding/gob"
"fmt"
"hash"
@@ -39,6 +40,12 @@ import (
)
var chunkSize = 8 << 10
+var versionFmt = "%05d"
+var chunkIdFmt = "%015d"
+
+type Repo struct {
+ path string
+}
type File struct {
Path string
@@ -55,29 +62,34 @@ type Chunk struct {
Value []byte
}
-func Commit(source string, repo string) {
- versions := LoadVersions(repo)
+func NewRepo(path string) *Repo {
+ os.MkdirAll(path, 0775)
+ return &Repo{path}
+}
+
+func (r *Repo) Commit(source string) {
+ versions := r.loadVersions()
newVersion := len(versions)
- newPath := path.Join(repo, fmt.Sprintf("%05d", newVersion))
+ newPath := path.Join(r.path, fmt.Sprintf(versionFmt, newVersion))
newChunkPath := path.Join(newPath, "chunks")
- // newFilesPath := path.Join(newPath, "files")
+ newFilesPath := path.Join(newPath, "files")
os.Mkdir(newPath, 0775)
os.Mkdir(newChunkPath, 0775)
newChunks := make(chan []byte, 16)
oldChunks := make(chan Chunk, 16)
- files := ListFiles(source)
- go LoadChunks(versions, oldChunks)
- go ReadFiles(files, newChunks)
- hashes := HashChunks(oldChunks)
- MatchChunks(newChunks, hashes)
- // StoreChunks(newChunkPath, newChunks)
- // StoreFiles(newFilesPath, files)
+ files := listFiles(source)
+ go loadChunks(versions, oldChunks)
+ go readFiles(files, newChunks)
+ // hashes := HashChunks(oldChunks)
+ // MatchChunks(newChunks, hashes)
+ storeChunks(newChunkPath, newChunks)
+ storeFiles(newFilesPath, files)
fmt.Println(files)
}
-func LoadVersions(repo string) []string {
+func (r *Repo) loadVersions() []string {
versions := make([]string, 0)
- files, err := os.ReadDir(repo)
+ files, err := os.ReadDir(r.path)
if err != nil {
log.Fatalln(err)
}
@@ -85,12 +97,12 @@ func LoadVersions(repo string) []string {
if !f.IsDir() {
continue
}
- versions = append(versions, path.Join(repo, f.Name()))
+ versions = append(versions, path.Join(r.path, f.Name()))
}
return versions
}
-func ListFiles(path string) []File {
+func listFiles(path string) []File {
var files []File
err := filepath.Walk(path,
func(p string, i fs.FileInfo, err error) error {
@@ -110,7 +122,7 @@ func ListFiles(path string) []File {
return files
}
-func ReadFiles(files []File, chunks chan<- []byte) {
+func readFiles(files []File, chunks chan<- []byte) {
var buff []byte
var prev, read = chunkSize, 0
@@ -140,14 +152,14 @@ func ReadFiles(files []File, chunks chan<- []byte) {
close(chunks)
}
-func StoreFiles(dest string, files []File) {
+func storeFiles(dest string, files []File) {
err := writeFile(dest, files)
if err != nil {
log.Println(err)
}
}
-func LoadFiles(path string) []File {
+func loadFiles(path string) []File {
files := make([]File, 0)
err := readFile(path, &files)
if err != nil {
@@ -156,16 +168,16 @@ func LoadFiles(path string) []File {
return files
}
-func PrintChunks(chunks <-chan []byte) {
+func printChunks(chunks <-chan []byte) {
for c := range chunks {
fmt.Println(c)
}
}
-func StoreChunks(dest string, chunks <-chan []byte) {
+func storeChunks(dest string, chunks <-chan []byte) {
i := 0
for c := range chunks {
- path := path.Join(dest, fmt.Sprintf("%015d", i))
+ path := path.Join(dest, fmt.Sprintf(chunkIdFmt, i))
err := os.WriteFile(path, c, 0664)
if err != nil {
log.Println(err)
@@ -174,7 +186,7 @@ func StoreChunks(dest string, chunks <-chan []byte) {
}
}
-func LoadChunks(versions []string, chunks chan<- Chunk) {
+func loadChunks(versions []string, chunks chan<- Chunk) {
for i, v := range versions {
p := path.Join(v, "chunks")
entries, err := os.ReadDir(p)
@@ -203,7 +215,7 @@ func LoadChunks(versions []string, chunks chan<- Chunk) {
close(chunks)
}
-func HashChunks(chunks <-chan Chunk) map[uint64]ChunkId {
+func hashChunks(chunks <-chan Chunk) map[uint64]ChunkId {
hashes := make(map[uint64]ChunkId)
hasher := hash.Hash64(rabinkarp64.New())
for c := range chunks {
@@ -215,15 +227,17 @@ func HashChunks(chunks <-chan Chunk) map[uint64]ChunkId {
return hashes
}
-func MatchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) {
+func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io.Reader {
hasher := rabinkarp64.New()
hasher.Write(<-chunks)
+ recipe := make([]io.Reader, 0)
var i uint64
- var offset int
- var prefill int
- var postfill int
+ var offset, prefill, postfill int
+ var exists bool
+ var chunkId ChunkId
for c := range chunks {
+ buff := make([]byte, 0)
// Pre fill the window with the rest of the previous chunk
for prefill = 0; prefill < offset; prefill++ {
hasher.Roll(c[prefill])
@@ -231,20 +245,37 @@ func MatchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) {
// Fill the window with the current chunk and match hash byte by byte
for ; offset < len(c); offset++ {
h := hasher.Sum64()
- chunk, exists := hashes[h]
+ chunkId, exists = hashes[h]
if exists {
- fmt.Printf("Found existing chunk: New{id:%d, offset:%d} Old%d\n", i, offset, chunk)
+ // log.Printf("Found existing chunk: New{id:%d, offset:%d} Old%d\n", i, offset, chunkId)
break
}
hasher.Roll(c[offset])
+ buff = append(buff, c[offset])
}
// Fill the window with the rest of the current chunk if it matched early
for postfill = offset; postfill < len(c); postfill++ {
hasher.Roll(c[postfill])
}
+ if len(buff) > 0 {
+ recipe = append(recipe, bytes.NewReader(buff))
+ }
+ if exists {
+ recipe = append(recipe, chunkId2Reader(chunkId, r.path))
+ }
offset %= chunkSize
i++
}
+ return recipe
+}
+
+func chunkId2Reader(c ChunkId, repo string) io.Reader {
+ p := path.Join(repo, fmt.Sprintf(versionFmt, c.Ver), "chunks", fmt.Sprintf(chunkIdFmt, c.Idx))
+ f, err := os.Open(p)
+ if err != nil {
+ log.Printf("Cannot open chunk %s\n", p)
+ }
+ return f
}
func writeFile(filePath string, object interface{}) error {
diff --git a/repo_test.go b/repo_test.go
index 6475fc2..bde2a63 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -2,10 +2,13 @@ package main
import (
"bytes"
+ "io/ioutil"
"log"
"os"
"path"
"testing"
+
+ "github.com/gabstv/go-bsdiff/pkg/bsdiff"
)
func TestMain(m *testing.M) {
@@ -31,8 +34,8 @@ func prepareResult() string {
func chunkCompare(t *testing.T, dataDir string, testFiles []string, chunkCount int) {
chunks := make(chan []byte)
- files := ListFiles(dataDir)
- go ReadFiles(files, chunks)
+ files := listFiles(dataDir)
+ go readFiles(files, chunks)
offset := 0
buff := make([]byte, chunkSize*chunkCount)
@@ -100,12 +103,12 @@ func TestLoadChunks(t *testing.T) {
chunks1 := make(chan []byte, 16)
chunks2 := make(chan []byte, 16)
chunks3 := make(chan Chunk, 16)
- files := ListFiles(dataDir)
- go ReadFiles(files, chunks1)
- go ReadFiles(files, chunks2)
- StoreChunks(resultChunks, chunks1)
+ files := listFiles(dataDir)
+ go readFiles(files, chunks1)
+ go readFiles(files, chunks2)
+ storeChunks(resultChunks, chunks1)
versions := []string{resultVersion}
- go LoadChunks(versions, chunks3)
+ go loadChunks(versions, chunks3)
i := 0
for c2 := range chunks2 {
@@ -123,9 +126,9 @@ func TestStoreLoadFiles(t *testing.T) {
resultDir := prepareResult()
dataDir := path.Join("test", "data")
resultFiles := path.Join(resultDir, "files")
- files1 := ListFiles(dataDir)
- StoreFiles(resultFiles, files1)
- files2 := LoadFiles(resultFiles)
+ files1 := listFiles(dataDir)
+ storeFiles(resultFiles, files1)
+ files2 := loadFiles(resultFiles)
for i, f := range files1 {
if f != files2[i] {
t.Errorf("Loaded file data %d does not match stored one", i)
@@ -134,3 +137,36 @@ func TestStoreLoadFiles(t *testing.T) {
}
}
}
+
+func TestBsdiff(t *testing.T) {
+ resultDir := prepareResult()
+ dataDir := path.Join("test", "data")
+ addedFile := path.Join(dataDir, "logs.2", "slogTest.log")
+ resultVersion := path.Join(resultDir, "00000")
+ resultChunks := path.Join(resultVersion, "chunks")
+ os.MkdirAll(resultChunks, 0775)
+ chunks := make(chan []byte, 16)
+ files := listFiles(dataDir)
+ go readFiles(files, chunks)
+ storeChunks(resultChunks, chunks)
+
+ input, _ := ioutil.ReadFile(path.Join(dataDir, "logs.1", "logTest.log"))
+ ioutil.WriteFile(addedFile, input, 0664)
+
+ newChunks := make(chan []byte, 16)
+ oldChunks := make(chan Chunk, 16)
+ files = listFiles(dataDir)
+ repo := NewRepo(resultDir)
+ versions := repo.loadVersions()
+ go loadChunks(versions, oldChunks)
+ go readFiles(files, newChunks)
+ hashes := hashChunks(oldChunks)
+ recipe := repo.matchChunks(newChunks, hashes)
+ buff := new(bytes.Buffer)
+ bsdiff.Reader(recipe[2], recipe[0], buff)
+ if len(buff.Bytes()) >= chunkSize {
+ t.Errorf("Bsdiff of chunk is too large: %d", len(buff.Bytes()))
+ }
+
+ os.Remove(addedFile)
+}