diff options
author | n-peugnet <n.peugnet@free.fr> | 2021-08-23 21:12:55 +0200 |
---|---|---|
committer | n-peugnet <n.peugnet@free.fr> | 2021-08-23 21:12:55 +0200 |
commit | b070eae35c1e7a4996b90208153d01f2be08d588 (patch) | |
tree | 67069c772a376fcd7da199c6bb821cc7261cd523 | |
parent | a67dfdd993dee93950c4b60e99ab6ca92d842072 (diff) | |
download | dna-backup-b070eae35c1e7a4996b90208153d01f2be08d588.tar.gz dna-backup-b070eae35c1e7a4996b90208153d01f2be08d588.zip |
try bsdiff and rework some of the API
-rw-r--r-- | go.mod | 5 | ||||
-rw-r--r-- | go.sum | 4 | ||||
-rw-r--r-- | main.go | 5 | ||||
-rw-r--r-- | repo.go | 89 | ||||
-rw-r--r-- | repo_test.go | 56 |
5 files changed, 116 insertions, 43 deletions
@@ -2,4 +2,7 @@ module github.com/n-peugnet/dna-backup go 1.16 -require github.com/chmduquesne/rollinghash v4.0.0+incompatible +require ( + github.com/chmduquesne/rollinghash v4.0.0+incompatible + github.com/gabstv/go-bsdiff v1.0.5 +) @@ -1,2 +1,6 @@ github.com/chmduquesne/rollinghash v4.0.0+incompatible h1:hnREQO+DXjqIw3rUTzWN7/+Dpw+N5Um8zpKV0JOEgbo= github.com/chmduquesne/rollinghash v4.0.0+incompatible/go.mod h1:Uc2I36RRfTAf7Dge82bi3RU0OQUmXT9iweIcPqvr8A0= +github.com/dsnet/compress v0.0.0-20171208185109-cc9eb1d7ad76 h1:eX+pdPPlD279OWgdx7f6KqIRSONuK7egk+jDx7OM3Ac= +github.com/dsnet/compress v0.0.0-20171208185109-cc9eb1d7ad76/go.mod h1:KjxHHirfLaw19iGT70HvVjHQsL1vq1SRQB4yOsAfy2s= +github.com/gabstv/go-bsdiff v1.0.5 h1:g29MC/38Eaig+iAobW10/CiFvPtin8U3Jj4yNLcNG9k= +github.com/gabstv/go-bsdiff v1.0.5/go.mod h1:/Zz6GK+/f/TMylRtVaW3uwZlb0FZITILfA0q12XKGwg= @@ -14,7 +14,6 @@ func main() { source := os.Args[1] dest := os.Args[2] - - os.MkdirAll(dest, 0775) - Commit(source, dest) + repo := NewRepo(dest) + repo.Commit(source) } @@ -25,6 +25,7 @@ repo/ package main import ( + "bytes" "encoding/gob" "fmt" "hash" @@ -39,6 +40,12 @@ import ( ) var chunkSize = 8 << 10 +var versionFmt = "%05d" +var chunkIdFmt = "%015d" + +type Repo struct { + path string +} type File struct { Path string @@ -55,29 +62,34 @@ type Chunk struct { Value []byte } -func Commit(source string, repo string) { - versions := LoadVersions(repo) +func NewRepo(path string) *Repo { + os.MkdirAll(path, 0775) + return &Repo{path} +} + +func (r *Repo) Commit(source string) { + versions := r.loadVersions() newVersion := len(versions) - newPath := path.Join(repo, fmt.Sprintf("%05d", newVersion)) + newPath := path.Join(r.path, fmt.Sprintf(versionFmt, newVersion)) newChunkPath := path.Join(newPath, "chunks") - // newFilesPath := path.Join(newPath, "files") + newFilesPath := path.Join(newPath, "files") os.Mkdir(newPath, 0775) os.Mkdir(newChunkPath, 0775) newChunks := make(chan []byte, 16) oldChunks := make(chan Chunk, 16) - files := ListFiles(source) - go LoadChunks(versions, oldChunks) - go ReadFiles(files, newChunks) - hashes := HashChunks(oldChunks) - MatchChunks(newChunks, hashes) - // StoreChunks(newChunkPath, newChunks) - // StoreFiles(newFilesPath, files) + files := listFiles(source) + go loadChunks(versions, oldChunks) + go readFiles(files, newChunks) + // hashes := HashChunks(oldChunks) + // MatchChunks(newChunks, hashes) + storeChunks(newChunkPath, newChunks) + storeFiles(newFilesPath, files) fmt.Println(files) } -func LoadVersions(repo string) []string { +func (r *Repo) loadVersions() []string { versions := make([]string, 0) - files, err := os.ReadDir(repo) + files, err := os.ReadDir(r.path) if err != nil { log.Fatalln(err) } @@ -85,12 +97,12 @@ func LoadVersions(repo string) []string { if !f.IsDir() { continue } - versions = append(versions, path.Join(repo, f.Name())) + versions = append(versions, path.Join(r.path, f.Name())) } return versions } -func ListFiles(path string) []File { +func listFiles(path string) []File { var files []File err := filepath.Walk(path, func(p string, i fs.FileInfo, err error) error { @@ -110,7 +122,7 @@ func ListFiles(path string) []File { return files } -func ReadFiles(files []File, chunks chan<- []byte) { +func readFiles(files []File, chunks chan<- []byte) { var buff []byte var prev, read = chunkSize, 0 @@ -140,14 +152,14 @@ func ReadFiles(files []File, chunks chan<- []byte) { close(chunks) } -func StoreFiles(dest string, files []File) { +func storeFiles(dest string, files []File) { err := writeFile(dest, files) if err != nil { log.Println(err) } } -func LoadFiles(path string) []File { +func loadFiles(path string) []File { files := make([]File, 0) err := readFile(path, &files) if err != nil { @@ -156,16 +168,16 @@ func LoadFiles(path string) []File { return files } -func PrintChunks(chunks <-chan []byte) { +func printChunks(chunks <-chan []byte) { for c := range chunks { fmt.Println(c) } } -func StoreChunks(dest string, chunks <-chan []byte) { +func storeChunks(dest string, chunks <-chan []byte) { i := 0 for c := range chunks { - path := path.Join(dest, fmt.Sprintf("%015d", i)) + path := path.Join(dest, fmt.Sprintf(chunkIdFmt, i)) err := os.WriteFile(path, c, 0664) if err != nil { log.Println(err) @@ -174,7 +186,7 @@ func StoreChunks(dest string, chunks <-chan []byte) { } } -func LoadChunks(versions []string, chunks chan<- Chunk) { +func loadChunks(versions []string, chunks chan<- Chunk) { for i, v := range versions { p := path.Join(v, "chunks") entries, err := os.ReadDir(p) @@ -203,7 +215,7 @@ func LoadChunks(versions []string, chunks chan<- Chunk) { close(chunks) } -func HashChunks(chunks <-chan Chunk) map[uint64]ChunkId { +func hashChunks(chunks <-chan Chunk) map[uint64]ChunkId { hashes := make(map[uint64]ChunkId) hasher := hash.Hash64(rabinkarp64.New()) for c := range chunks { @@ -215,15 +227,17 @@ func HashChunks(chunks <-chan Chunk) map[uint64]ChunkId { return hashes } -func MatchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) { +func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io.Reader { hasher := rabinkarp64.New() hasher.Write(<-chunks) + recipe := make([]io.Reader, 0) var i uint64 - var offset int - var prefill int - var postfill int + var offset, prefill, postfill int + var exists bool + var chunkId ChunkId for c := range chunks { + buff := make([]byte, 0) // Pre fill the window with the rest of the previous chunk for prefill = 0; prefill < offset; prefill++ { hasher.Roll(c[prefill]) @@ -231,20 +245,37 @@ func MatchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) { // Fill the window with the current chunk and match hash byte by byte for ; offset < len(c); offset++ { h := hasher.Sum64() - chunk, exists := hashes[h] + chunkId, exists = hashes[h] if exists { - fmt.Printf("Found existing chunk: New{id:%d, offset:%d} Old%d\n", i, offset, chunk) + // log.Printf("Found existing chunk: New{id:%d, offset:%d} Old%d\n", i, offset, chunkId) break } hasher.Roll(c[offset]) + buff = append(buff, c[offset]) } // Fill the window with the rest of the current chunk if it matched early for postfill = offset; postfill < len(c); postfill++ { hasher.Roll(c[postfill]) } + if len(buff) > 0 { + recipe = append(recipe, bytes.NewReader(buff)) + } + if exists { + recipe = append(recipe, chunkId2Reader(chunkId, r.path)) + } offset %= chunkSize i++ } + return recipe +} + +func chunkId2Reader(c ChunkId, repo string) io.Reader { + p := path.Join(repo, fmt.Sprintf(versionFmt, c.Ver), "chunks", fmt.Sprintf(chunkIdFmt, c.Idx)) + f, err := os.Open(p) + if err != nil { + log.Printf("Cannot open chunk %s\n", p) + } + return f } func writeFile(filePath string, object interface{}) error { diff --git a/repo_test.go b/repo_test.go index 6475fc2..bde2a63 100644 --- a/repo_test.go +++ b/repo_test.go @@ -2,10 +2,13 @@ package main import ( "bytes" + "io/ioutil" "log" "os" "path" "testing" + + "github.com/gabstv/go-bsdiff/pkg/bsdiff" ) func TestMain(m *testing.M) { @@ -31,8 +34,8 @@ func prepareResult() string { func chunkCompare(t *testing.T, dataDir string, testFiles []string, chunkCount int) { chunks := make(chan []byte) - files := ListFiles(dataDir) - go ReadFiles(files, chunks) + files := listFiles(dataDir) + go readFiles(files, chunks) offset := 0 buff := make([]byte, chunkSize*chunkCount) @@ -100,12 +103,12 @@ func TestLoadChunks(t *testing.T) { chunks1 := make(chan []byte, 16) chunks2 := make(chan []byte, 16) chunks3 := make(chan Chunk, 16) - files := ListFiles(dataDir) - go ReadFiles(files, chunks1) - go ReadFiles(files, chunks2) - StoreChunks(resultChunks, chunks1) + files := listFiles(dataDir) + go readFiles(files, chunks1) + go readFiles(files, chunks2) + storeChunks(resultChunks, chunks1) versions := []string{resultVersion} - go LoadChunks(versions, chunks3) + go loadChunks(versions, chunks3) i := 0 for c2 := range chunks2 { @@ -123,9 +126,9 @@ func TestStoreLoadFiles(t *testing.T) { resultDir := prepareResult() dataDir := path.Join("test", "data") resultFiles := path.Join(resultDir, "files") - files1 := ListFiles(dataDir) - StoreFiles(resultFiles, files1) - files2 := LoadFiles(resultFiles) + files1 := listFiles(dataDir) + storeFiles(resultFiles, files1) + files2 := loadFiles(resultFiles) for i, f := range files1 { if f != files2[i] { t.Errorf("Loaded file data %d does not match stored one", i) @@ -134,3 +137,36 @@ func TestStoreLoadFiles(t *testing.T) { } } } + +func TestBsdiff(t *testing.T) { + resultDir := prepareResult() + dataDir := path.Join("test", "data") + addedFile := path.Join(dataDir, "logs.2", "slogTest.log") + resultVersion := path.Join(resultDir, "00000") + resultChunks := path.Join(resultVersion, "chunks") + os.MkdirAll(resultChunks, 0775) + chunks := make(chan []byte, 16) + files := listFiles(dataDir) + go readFiles(files, chunks) + storeChunks(resultChunks, chunks) + + input, _ := ioutil.ReadFile(path.Join(dataDir, "logs.1", "logTest.log")) + ioutil.WriteFile(addedFile, input, 0664) + + newChunks := make(chan []byte, 16) + oldChunks := make(chan Chunk, 16) + files = listFiles(dataDir) + repo := NewRepo(resultDir) + versions := repo.loadVersions() + go loadChunks(versions, oldChunks) + go readFiles(files, newChunks) + hashes := hashChunks(oldChunks) + recipe := repo.matchChunks(newChunks, hashes) + buff := new(bytes.Buffer) + bsdiff.Reader(recipe[2], recipe[0], buff) + if len(buff.Bytes()) >= chunkSize { + t.Errorf("Bsdiff of chunk is too large: %d", len(buff.Bytes())) + } + + os.Remove(addedFile) +} |