From c481eb2b44adf50b62de3b9e3355f64973967d52 Mon Sep 17 00:00:00 2001 From: n-peugnet Date: Tue, 31 Aug 2021 12:05:29 +0200 Subject: do not fill partial chunks with padding this way a partial chunk may have fewer superfeatures than a complete one --- sketch.go | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'sketch.go') diff --git a/sketch.go b/sketch.go index db7e4e6..9910848 100644 --- a/sketch.go +++ b/sketch.go @@ -18,31 +18,30 @@ const fBytes = 8 func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) { var fSize = chunkSize / (sfCount * fCount) superfeatures := make([]uint64, 0, sfCount) - features := make([]uint64, 0, fCount) + features := make([]uint64, 0, fCount*sfCount) buff := make([]byte, fBytes*fCount) r := chunk.Reader() hasher := rabinkarp64.New() - for sf := 0; sf < sfCount; sf++ { - features = features[:0] - for f := 0; f < fCount; f++ { - hasher.Reset() - n, err := io.CopyN(hasher, r, int64(wSize)) - if err != nil { - log.Println(n, err) - } - max := hasher.Sum64() - for w := 0; w < fSize-wSize; w++ { - b, _ := r.ReadByte() - hasher.Roll(b) - h := hasher.Sum64() - if h > max { - max = h - } + for f := 0; f < chunk.Len()/fSize; f++ { + hasher.Reset() + n, err := io.CopyN(hasher, r, int64(wSize)) + if err != nil { + log.Println(n, err) + } + max := hasher.Sum64() + for w := 0; w < fSize-wSize; w++ { + b, _ := r.ReadByte() + hasher.Roll(b) + h := hasher.Sum64() + if h > max { + max = h } - features = append(features, max) } - for i, f := range features { - binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], f) + features = append(features, max) + } + for sf := 0; sf < len(features)/fCount; sf++ { + for i := 0; i < fCount; i++ { + binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], features[i+sf*fCount]) } hasher.Reset() hasher.Write(buff) -- cgit v1.2.3