1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
package main
import (
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"log"

	"github.com/chmduquesne/rollinghash/rabinkarp64"
)
// Sketch is a list of 64-bit values describing a chunk. SketchChunk returns
// one holding a single value per super-feature.
type Sketch []uint64
// fBytes is the number of bytes each feature occupies when the features of a
// super-feature are serialized with PutUint64 before being hashed together.
const fBytes = 8
// SketchChunk produces a sketch for a chunk: a list of sfCount super-features,
// each derived from fCount features.
//
// The chunk's reader is consumed sequentially, one region per feature. The
// region size is chunkSize/(sfCount*fCount), chunkSize being declared elsewhere
// in this package. For every region a rolling hash is primed with the first
// wSize bytes and then rolled one byte at a time over the rest of the region;
// the feature is the largest hash value observed. The fCount features of a
// super-feature are serialized as little-endian 64-bit integers and hashed
// together to give the super-feature.
//
// Parameters:
//   - wSize: the rolling-hash window size in bytes.
//   - sfCount: the number of super-features in the returned sketch.
//   - fCount: the number of features per super-feature.
//
// All three must be positive, otherwise an error is returned.
//
// If the chunk runs out of data early, the condition is logged once and the
// remaining features are computed from whatever bytes were available (possibly
// none); no padding bytes are rolled into the hash. Any read error other than
// io.EOF aborts the computation and is returned.
func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) {
	if wSize <= 0 || sfCount <= 0 || fCount <= 0 {
		return nil, fmt.Errorf(
			"sketch: invalid parameters (wSize=%d, sfCount=%d, fCount=%d): all must be positive",
			wSize, sfCount, fCount)
	}
	fSize := chunkSize / (sfCount * fCount)
	sketch := make(Sketch, 0, sfCount)
	features := make([]uint64, 0, fCount)
	buff := make([]byte, fBytes*fCount)
	r, err := chunk.Reader()
	if err != nil {
		return nil, err
	}
	hasher := rabinkarp64.New()
	// exhausted is set once the reader reports io.EOF; after that no further
	// reads are attempted.
	exhausted := false
	for sf := 0; sf < sfCount; sf++ {
		features = features[:0]
		for f := 0; f < fCount; f++ {
			hasher.Reset()
			// rolls stays 0 unless the window was completely filled: Roll
			// must only be called on a window initialized through Write.
			rolls := 0
			if !exhausted {
				n, err := io.CopyN(hasher, r, int64(wSize))
				switch {
				case err == nil:
					rolls = fSize - wSize
				case errors.Is(err, io.EOF):
					log.Printf("sketch: chunk ended after %d of %d window bytes: %v", n, wSize, err)
					exhausted = true
				default:
					return nil, fmt.Errorf("sketch: reading window: %w", err)
				}
			}
			best := hasher.Sum64()
			for w := 0; w < rolls; w++ {
				b, err := r.ReadByte()
				if err != nil {
					if errors.Is(err, io.EOF) {
						// Short chunk: keep the maximum seen so far instead
						// of rolling bytes that are not part of the data.
						exhausted = true
						break
					}
					return nil, fmt.Errorf("sketch: reading feature byte: %w", err)
				}
				hasher.Roll(b)
				if h := hasher.Sum64(); h > best {
					best = h
				}
			}
			features = append(features, best)
		}
		// Serialize the features so they can be fed to the hasher as bytes.
		for i, f := range features {
			binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], f)
		}
		hasher.Reset()
		hasher.Write(buff)
		sketch = append(sketch, hasher.Sum64())
	}
	return sketch, nil
}
|