aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--chunk.go28
-rw-r--r--repo.go10
-rw-r--r--repo_test.go10
-rw-r--r--sketch.go39
-rw-r--r--sketch_test.go10
-rw-r--r--test/data/repo_8k/00000/chunks/000000000000014bin8192 -> 4710 bytes
6 files changed, 64 insertions, 33 deletions
diff --git a/chunk.go b/chunk.go
index 92e3563..78e689d 100644
--- a/chunk.go
+++ b/chunk.go
@@ -17,6 +17,7 @@ type ChunkReader interface {
type Chunk interface {
Reader() ChunkReader
+ Len() int
}
type StoredChunk interface {
@@ -29,11 +30,15 @@ type ChunkId struct {
Idx uint64
}
+func (i *ChunkId) Path(repo string) string {
+ return path.Join(repo, fmt.Sprintf(versionFmt, i.Ver), chunksName, fmt.Sprintf(chunkIdFmt, i.Idx))
+}
+
func (i *ChunkId) Reader(repo string) ChunkReader {
- p := path.Join(repo, fmt.Sprintf(versionFmt, i.Ver), chunksName, fmt.Sprintf(chunkIdFmt, i.Idx))
- f, err := os.Open(p)
+ path := i.Path(repo)
+ f, err := os.Open(path)
if err != nil {
- log.Printf("Cannot open chunk %s\n", p)
+ log.Println("Cannot open chunk: ", path)
}
return bufio.NewReaderSize(f, chunkSize)
}
@@ -56,6 +61,10 @@ func (c *LoadedChunk) Reader() ChunkReader {
return bytes.NewReader(c.value)
}
+func (c *LoadedChunk) Len() int {
+ return len(c.value)
+}
+
func NewChunkFile(repo *Repo, id *ChunkId) *ChunkFile {
return &ChunkFile{repo: repo, id: id}
}
@@ -74,6 +83,15 @@ func (c *ChunkFile) Reader() ChunkReader {
return c.id.Reader(c.repo.path)
}
+func (c *ChunkFile) Len() int {
+ path := c.id.Path(c.repo.path)
+ info, err := os.Stat(path)
+ if err != nil {
+ log.Println("Chunk: could not stat file:", path)
+ }
+ return int(info.Size())
+}
+
func NewTempChunk(value []byte) *TempChunk {
return &TempChunk{value: value}
}
@@ -85,3 +103,7 @@ type TempChunk struct {
func (c *TempChunk) Reader() ChunkReader {
return bytes.NewReader(c.value)
}
+
+func (c *TempChunk) Len() int {
+ return len(c.value)
+}
diff --git a/repo.go b/repo.go
index a505859..55e830f 100644
--- a/repo.go
+++ b/repo.go
@@ -145,7 +145,7 @@ func chunkStream(stream io.Reader, chunks chan<- []byte) {
}
}
if prev != chunkSize {
- chunks <- buff
+ chunks <- buff[:prev]
}
close(chunks)
}
@@ -290,9 +290,7 @@ func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chun
if len(buff) > chunkSize && len(buff) < chunkSize*2 {
size := len(buff) - chunkSize
log.Println("Add new partial chunk of size:", size)
- tmp := make([]byte, 0, chunkSize)
- tmp = append(tmp, buff[:size]...)
- chunks = append(chunks, NewTempChunk(tmp[:chunkSize]))
+ chunks = append(chunks, NewTempChunk(buff[:size]))
}
log.Printf("Add existing chunk: %d\n", chunkId)
chunks = append(chunks, NewChunkFile(r, chunkId))
@@ -321,10 +319,10 @@ func (r *Repo) matchStream(stream io.Reader, fingerprints FingerprintMap) []Chun
log.Println("Add new chunk")
chunks = append(chunks, NewTempChunk(buff[:chunkSize]))
log.Println("Add new partial chunk of size:", len(buff)-chunkSize)
- chunks = append(chunks, NewTempChunk(buff[chunkSize:chunkSize*2]))
+ chunks = append(chunks, NewTempChunk(buff[chunkSize:]))
} else if len(buff) > 0 {
log.Println("Add new partial chunk of size:", len(buff))
- chunks = append(chunks, NewTempChunk(buff[chunkSize:chunkSize*2]))
+ chunks = append(chunks, NewTempChunk(buff))
}
return chunks
}
diff --git a/repo_test.go b/repo_test.go
index cbb54df..cdd3024 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -35,16 +35,18 @@ func chunkCompare(t *testing.T, dataDir string, testFiles []string, chunkCount i
i := 0
for c := range chunks {
- content := buff[i*chunkSize : (i+1)*chunkSize]
- if len(c) != chunkSize {
- t.Errorf("Chunk %d is not of chunkSize: %d", i, chunkSize)
+ start := i * chunkSize
+ end := (i + 1) * chunkSize
+ if end > offset {
+ end = offset
}
+ content := buff[start:end]
if bytes.Compare(c, content) != 0 {
t.Errorf("Chunk %d does not match file content", i)
// for i, b := range c {
// fmt.Printf("E: %d, A: %d\n", b, content[i])
// }
- t.Log("Expected: ", c[:10], "...", c[chunkSize-10:])
+ t.Log("Expected: ", c[:10], "...", c[end%chunkSize-10:])
t.Log("Actual:", content)
}
i++
diff --git a/sketch.go b/sketch.go
index db7e4e6..9910848 100644
--- a/sketch.go
+++ b/sketch.go
@@ -18,31 +18,30 @@ const fBytes = 8
func SketchChunk(chunk Chunk, wSize int, sfCount int, fCount int) (Sketch, error) {
var fSize = chunkSize / (sfCount * fCount)
superfeatures := make([]uint64, 0, sfCount)
- features := make([]uint64, 0, fCount)
+ features := make([]uint64, 0, fCount*sfCount)
buff := make([]byte, fBytes*fCount)
r := chunk.Reader()
hasher := rabinkarp64.New()
- for sf := 0; sf < sfCount; sf++ {
- features = features[:0]
- for f := 0; f < fCount; f++ {
- hasher.Reset()
- n, err := io.CopyN(hasher, r, int64(wSize))
- if err != nil {
- log.Println(n, err)
- }
- max := hasher.Sum64()
- for w := 0; w < fSize-wSize; w++ {
- b, _ := r.ReadByte()
- hasher.Roll(b)
- h := hasher.Sum64()
- if h > max {
- max = h
- }
+ for f := 0; f < chunk.Len()/fSize; f++ {
+ hasher.Reset()
+ n, err := io.CopyN(hasher, r, int64(wSize))
+ if err != nil {
+ log.Println(n, err)
+ }
+ max := hasher.Sum64()
+ for w := 0; w < fSize-wSize; w++ {
+ b, _ := r.ReadByte()
+ hasher.Roll(b)
+ h := hasher.Sum64()
+ if h > max {
+ max = h
}
- features = append(features, max)
}
- for i, f := range features {
- binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], f)
+ features = append(features, max)
+ }
+ for sf := 0; sf < len(features)/fCount; sf++ {
+ for i := 0; i < fCount; i++ {
+ binary.LittleEndian.PutUint64(buff[i*fBytes:(i+1)*fBytes], features[i+sf*fCount])
}
hasher.Reset()
hasher.Write(buff)
diff --git a/sketch_test.go b/sketch_test.go
index 86a2c58..074bffd 100644
--- a/sketch_test.go
+++ b/sketch_test.go
@@ -24,6 +24,16 @@ func TestSketchChunk(t *testing.T) {
t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch)
}
}
+ if i == 14 {
+ sketch, err := SketchChunk(c, 32, 3, 4)
+ if err != nil {
+ t.Error(err)
+ }
+ expected := Sketch{658454504014104}
+ if !reflect.DeepEqual(sketch, expected) {
+ t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch)
+ }
+ }
i++
}
}
diff --git a/test/data/repo_8k/00000/chunks/000000000000014 b/test/data/repo_8k/00000/chunks/000000000000014
index d616ce9..ab7db22 100644
--- a/test/data/repo_8k/00000/chunks/000000000000014
+++ b/test/data/repo_8k/00000/chunks/000000000000014
Binary files differ