aboutsummaryrefslogtreecommitdiff
path: root/sketch.go
diff options
context:
space:
mode:
Diffstat (limited to 'sketch.go')
-rw-r--r--sketch.go47
1 files changed, 47 insertions, 0 deletions
diff --git a/sketch.go b/sketch.go
new file mode 100644
index 0000000..c5f0838
--- /dev/null
+++ b/sketch.go
@@ -0,0 +1,47 @@
+package main
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+
+	"github.com/chmduquesne/rollinghash/rabinkarp64"
+)
+
+// SketchChunk produces a sketch for a chunk: a slice of sfCount
+// super-features, each one being the hash of fCount consecutive features.
+//
+// The chunk is read sequentially. For every feature, the rolling hash is
+// seeded with the next wSize bytes and then rolled one byte at a time for
+// fSize-wSize more bytes, where fSize = chunkSize / (sfCount * fCount); the
+// feature is the largest hash value observed. The fCount features of a
+// super-feature are serialized as little-endian uint64 values and hashed
+// together to obtain the super-feature.
+//
+// Parameters:
+//   - wSize: the size in bytes of the rolling hash window,
+//   - sfCount: the number of super-features in the sketch,
+//   - fCount: the number of features per super-feature.
+//
+// An error is returned when wSize, sfCount or fCount is not strictly
+// positive, when the chunk reader cannot be obtained, or when reading the
+// chunk fails. A chunk that ends before all the features have been read is
+// reported as an error wrapping io.EOF instead of being silently completed
+// with bytes that are not part of the chunk.
+func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, error) {
+	// Reject parameters that would otherwise cause a division by zero, a
+	// negative slice capacity, or rolling over a window that was never filled.
+	if wSize <= 0 || sfCount <= 0 || fCount <= 0 {
+		return nil, fmt.Errorf("sketching chunk: invalid parameters: wSize=%d sfCount=%d fCount=%d", wSize, sfCount, fCount)
+	}
+	var fSize = chunkSize / (sfCount * fCount)
+	r, err := chunk.Reader()
+	if err != nil {
+		return nil, err
+	}
+	superfeatures := make([]uint64, 0, sfCount)
+	features := make([]uint64, 0, fCount)
+	// Scratch buffer holding the serialized features of one super-feature.
+	buff := make([]byte, 8*fCount)
+	hasher := rabinkarp64.New()
+	for sf := 0; sf < sfCount; sf++ {
+		features = features[:0]
+		for f := 0; f < fCount; f++ {
+			// Seed the rolling window with the first wSize bytes of the feature.
+			hasher.Reset()
+			if _, err := io.CopyN(hasher, r, int64(wSize)); err != nil {
+				return nil, fmt.Errorf("sketching chunk: reading window of feature %d of super-feature %d: %w", f, sf, err)
+			}
+			maxHash := hasher.Sum64()
+			// Slide the window over the rest of the feature, keeping the
+			// largest hash value seen.
+			for w := 0; w < fSize-wSize; w++ {
+				b, err := r.ReadByte()
+				if err != nil {
+					return nil, fmt.Errorf("sketching chunk: rolling over feature %d of super-feature %d: %w", f, sf, err)
+				}
+				hasher.Roll(b)
+				if h := hasher.Sum64(); h > maxHash {
+					maxHash = h
+				}
+			}
+			features = append(features, maxHash)
+		}
+		for i, f := range features {
+			binary.LittleEndian.PutUint64(buff[i*8:i*8+8], f)
+		}
+		hasher.Reset()
+		// hash.Hash documents that Write never returns an error.
+		hasher.Write(buff)
+		superfeatures = append(superfeatures, hasher.Sum64())
+	}
+	return superfeatures, nil
+}