From 212ce5c37857a2d64c56e3df8f415f48b8698462 Mon Sep 17 00:00:00 2001 From: n-peugnet Date: Mon, 23 Aug 2021 11:19:23 +0200 Subject: initial rolling hash matching --- go.mod | 2 ++ go.sum | 2 ++ repo.go | 45 ++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 go.sum diff --git a/go.mod b/go.mod index cdc2a48..d606d25 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/n-peugnet/dna-backup go 1.16 + +require github.com/chmduquesne/rollinghash v4.0.0+incompatible // indirect diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..c9d9a48 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/chmduquesne/rollinghash v4.0.0+incompatible h1:hnREQO+DXjqIw3rUTzWN7/+Dpw+N5Um8zpKV0JOEgbo= +github.com/chmduquesne/rollinghash v4.0.0+incompatible/go.mod h1:Uc2I36RRfTAf7Dge82bi3RU0OQUmXT9iweIcPqvr8A0= diff --git a/repo.go b/repo.go index 3a91407..daf6a0e 100644 --- a/repo.go +++ b/repo.go @@ -27,6 +27,7 @@ package main import ( "encoding/gob" "fmt" + "hash" "io" "io/fs" "io/ioutil" @@ -35,6 +36,8 @@ import ( "path" "path/filepath" "strconv" + + "github.com/chmduquesne/rollinghash/rabinkarp64" ) var chunkSize = 8 << 10 @@ -49,7 +52,7 @@ func Commit(source string, repo string) { new := latest + 1 newPath := path.Join(repo, fmt.Sprintf("%05d", new)) newChunkPath := path.Join(newPath, "chunks") - newFilesPath := path.Join(newPath, "files") + // newFilesPath := path.Join(newPath, "files") os.Mkdir(newPath, 0775) os.Mkdir(newChunkPath, 0775) newChunks := make(chan []byte, 16) @@ -57,8 +60,10 @@ func Commit(source string, repo string) { files := ListFiles(source) go LoadChunks(repo, oldChunks) go ReadFiles(files, newChunks) - StoreChunks(newChunkPath, newChunks) - StoreFiles(newFilesPath, files) + hashes := HashChunks(oldChunks) + MatchChunks(newChunks, hashes) + // StoreChunks(newChunkPath, newChunks) + // StoreFiles(newFilesPath, files) fmt.Println(files) } @@ -185,6 +190,40 @@ func LoadChunks(repo string, chunks chan<- []byte) { close(chunks) } +func HashChunks(chunks <-chan []byte) map[uint64]uint64 { + hashes := make(map[uint64]uint64) + hasher := hash.Hash64(rabinkarp64.New()) + var i uint64 = 0 + for c := range chunks { + hasher.Reset() + hasher.Write(c) + h := hasher.Sum64() + hashes[h] = i + i++ + } + return hashes +} + +func MatchChunks(chunks <-chan []byte, hashes map[uint64]uint64) { + hasher := rabinkarp64.New() + hasher.Write(<-chunks) + + var i uint64 = 0 + for c := range chunks { + for offset, b := range c { + + h := hasher.Sum64() + chunk, exists := hashes[h] + if exists { + fmt.Printf("Found existing chunk. New{id:%d, offset:%d}, Old: %d\n", i, offset, chunk) + } + // Roll the incoming byte in rolling + hasher.Roll(b) + } + i++ + } +} + func writeFile(filePath string, object interface{}) error { file, err := os.Create(filePath) if err == nil { -- cgit v1.2.3