aboutsummaryrefslogtreecommitdiff
path: root/sketch.go
diff options
context:
space:
mode:
authorn-peugnet <n.peugnet@free.fr>2021-08-27 18:38:07 +0200
committern-peugnet <n.peugnet@free.fr>2021-08-27 18:38:16 +0200
commit129a86b3a6780b7aee5a7469cc5adeaf2ea6c20f (patch)
treeab8423f6885c380b2bb4d807313428003d8d5e37 /sketch.go
parent78251f11c91b2504edfc02b760ef53bd352b856c (diff)
downloaddna-backup-129a86b3a6780b7aee5a7469cc5adeaf2ea6c20f.tar.gz
dna-backup-129a86b3a6780b7aee5a7469cc5adeaf2ea6c20f.zip
add findSimilarChunks method to test sketches
Still missing a real test...
Diffstat (limited to 'sketch.go')
-rw-r--r--sketch.go16
1 files changed, 12 insertions, 4 deletions
diff --git a/sketch.go b/sketch.go
index c5f0838..f226661 100644
--- a/sketch.go
+++ b/sketch.go
@@ -3,18 +3,23 @@ package main
import (
"encoding/binary"
"io"
+ "log"
"github.com/chmduquesne/rollinghash/rabinkarp64"
)
+type Sketch []uint64
+
+const fBytes = 8
+
// SketchChunk produces a sketch for a chunk based on wSize: the window size,
// sfCount: the number of super-features, and fCount: the number of features
// per super-feature.
-func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, error) {
+func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) {
var fSize = chunkSize / (sfCount * fCount)
superfeatures := make([]uint64, 0, sfCount)
features := make([]uint64, 0, fCount)
- buff := make([]byte, 8*fCount)
+ buff := make([]byte, fBytes*fCount)
r, err := chunk.Reader()
if err != nil {
return nil, err
@@ -24,7 +29,10 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, err
features = features[:0]
for f := 0; f < fCount; f++ {
hasher.Reset()
- io.CopyN(hasher, r, int64(wSize))
+ n, err := io.CopyN(hasher, r, int64(wSize))
+ if err != nil {
+ log.Println(n, err)
+ }
max := hasher.Sum64()
for w := 0; w < fSize-wSize; w++ {
b, _ := r.ReadByte()
@@ -37,7 +45,7 @@ func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) ([]uint64, err
features = append(features, max)
}
for i, f := range features {
- binary.LittleEndian.PutUint64(buff[i*8:i*8+8], f)
+ binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], f)
}
hasher.Reset()
hasher.Write(buff)