From 97638899fa9a160b1cc83a57d1c41d89f5763f49 Mon Sep 17 00:00:00 2001 From: n-peugnet Date: Mon, 23 Aug 2021 11:44:54 +0200 Subject: better rolling hash match If a chunk matched, directly skip to the next chunk instead of calculating every single byte by byte checksum in it. --- repo.go | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/repo.go b/repo.go index daf6a0e..37f0eda 100644 --- a/repo.go +++ b/repo.go @@ -208,18 +208,30 @@ func MatchChunks(chunks <-chan []byte, hashes map[uint64]uint64) { hasher := rabinkarp64.New() hasher.Write(<-chunks) - var i uint64 = 0 + var i uint64 + var offset int + var prefill int + var postfill int for c := range chunks { - for offset, b := range c { - + // Pre fill the window with the rest of the previous chunk + for prefill = 0; prefill < offset; prefill++ { + hasher.Roll(c[prefill]) + } + // Fill the window with the current chunk and match hash byte by byte + for ; offset < len(c); offset++ { h := hasher.Sum64() chunk, exists := hashes[h] if exists { fmt.Printf("Found existing chunk. New{id:%d, offset:%d}, Old: %d\n", i, offset, chunk) + break } - // Roll the incoming byte in rolling - hasher.Roll(b) + hasher.Roll(c[offset]) + } + // Fill the window with the rest of the current chunk if it matched early + for postfill = offset; postfill < len(c); postfill++ { + hasher.Roll(c[postfill]) } + offset %= chunkSize i++ } } -- cgit v1.2.3