From 129a86b3a6780b7aee5a7469cc5adeaf2ea6c20f Mon Sep 17 00:00:00 2001
From: n-peugnet
Date: Fri, 27 Aug 2021 18:38:07 +0200
Subject: add findSimilarChunks method to test sketches

Still missing a real test...
---
 sketch.go | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'sketch.go')

diff --git a/sketch.go b/sketch.go
index c5f0838..f226661 100644
--- a/sketch.go
+++ b/sketch.go
@@ -3,18 +3,23 @@ package main
 import (
 	"encoding/binary"
 	"io"
+	"log"
 
 	"github.com/chmduquesne/rollinghash/rabinkarp64"
 )
 
+type Sketch []uint64
+
+const fBytes = 8
+
 // SketchChunk produces a sketch for a chunk based on wSize: the window size,
 // sfCount: the number of super-features, and fCount: the number of feature
 // per super-feature
-func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, error) {
+func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) {
 	var fSize = chunkSize / (sfCount * fCount)
 	superfeatures := make([]uint64, 0, sfCount)
 	features := make([]uint64, 0, fCount)
-	buff := make([]byte, 8*fCount)
+	buff := make([]byte, fBytes*fCount)
 	r, err := chunk.Reader()
 	if err != nil {
 		return nil, err
@@ -24,7 +29,10 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, err
 		features = features[:0]
 		for f := 0; f < fCount; f++ {
 			hasher.Reset()
-			io.CopyN(hasher, r, int64(wSize))
+			n, err := io.CopyN(hasher, r, int64(wSize))
+			if err != nil {
+				log.Println(n, err)
+			}
 			max := hasher.Sum64()
 			for w := 0; w < fSize-wSize; w++ {
 				b, _ := r.ReadByte()
@@ -37,7 +45,7 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, err
 			features = append(features, max)
 		}
 		for i, f := range features {
-			binary.LittleEndian.PutUint64(buff[i*8:i*8+8], f)
+			binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], f)
 		}
 		hasher.Reset()
 		hasher.Write(buff)
-- 
cgit v1.2.3