From df6d5f7e24a290718adf8f068649c3bc61f5eb4d Mon Sep 17 00:00:00 2001
From: n-peugnet <n.peugnet@free.fr>
Date: Tue, 24 Aug 2021 18:31:13 +0200
Subject: refactor: extract chunk.go & add Reader getter

---
 TODO.md      | 11 +++++++++++
 chunk.go     | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 const.go     | 11 +++++++++++
 repo.go      | 42 +++++++++---------------------------------
 repo_test.go | 13 +++++++++----
 5 files changed, 87 insertions(+), 37 deletions(-)
 create mode 100644 TODO.md
 create mode 100644 chunk.go
 create mode 100644 const.go

diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..eb911f3
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,11 @@
+priority 1
+----------
+- join non-deduplicated chunks
+  - choose when and how to
+- detect Similar chunks
+  - implement "N-Transform SuperFeature" hash from Shilane-2012
+  - use the hash for detection
+
+priority 2
+----------
+- make more use of the `Reader` API (which is analogous to the `IOStream` in Java)
diff --git a/chunk.go b/chunk.go
new file mode 100644
index 0000000..9f09e55
--- /dev/null
+++ b/chunk.go
@@ -0,0 +1,47 @@
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"path"
+)
+
+type ChunkId struct {
+	Ver int
+	Idx uint64
+}
+
+func (i *ChunkId) Reader(repo string) io.Reader {
+	p := path.Join(repo, fmt.Sprintf(versionFmt, i.Ver), chunksName, fmt.Sprintf(chunkIdFmt, i.Idx))
+	f, err := os.Open(p)
+	if err != nil {
+		log.Printf("Cannot open chunk %s\n", p)
+	}
+	return f
+}
+
+type Chunk struct {
+	Id    *ChunkId
+	Value []byte
+}
+
+func (c *Chunk) Reader(repo string) (io.Reader, error) {
+	if c.Value != nil {
+		return bytes.NewReader(c.Value), nil
+	}
+	if c.Id != nil {
+		return c.Id.Reader(repo), nil
+	}
+	return nil, &ChunkError{"Uninitialized chunk"}
+}
+
+type ChunkError struct {
+	err string
+}
+
+func (e *ChunkError) Error() string {
+	return fmt.Sprintf("Chunk error: %s", e.err)
+}
diff --git a/const.go b/const.go
new file mode 100644
index 0000000..700d3df
--- /dev/null
+++ b/const.go
@@ -0,0 +1,11 @@
+package main
+
+// Defined as var rather than const because I want to keep
+// being able to change them at runtime.
+var (
+	chunkSize  = 8 << 10
+	chunksName = "chunks"
+	chunkIdFmt = "%015d"
+	versionFmt = "%05d"
+	filesName  = "files"
+)
diff --git a/repo.go b/repo.go
index dd96600..3bb60b9 100644
--- a/repo.go
+++ b/repo.go
@@ -25,7 +25,6 @@ repo/
 package main
 
 import (
-	"bytes"
 	"encoding/gob"
 	"fmt"
 	"hash"
@@ -39,10 +38,6 @@ import (
 	"github.com/chmduquesne/rollinghash/rabinkarp64"
 )
 
-var chunkSize = 8 << 10
-var versionFmt = "%05d"
-var chunkIdFmt = "%015d"
-
 type Repo struct {
 	path string
 }
@@ -52,16 +47,6 @@ type File struct {
 	Size int64
 }
 
-type ChunkId struct {
-	Ver int
-	Idx uint64
-}
-
-type Chunk struct {
-	Id    ChunkId
-	Value []byte
-}
-
 func NewRepo(path string) *Repo {
 	os.MkdirAll(path, 0775)
 	return &Repo{path}
@@ -71,8 +56,8 @@ func (r *Repo) Commit(source string) {
 	versions := r.loadVersions()
 	newVersion := len(versions)
 	newPath := path.Join(r.path, fmt.Sprintf(versionFmt, newVersion))
-	newChunkPath := path.Join(newPath, "chunks")
-	newFilesPath := path.Join(newPath, "files")
+	newChunkPath := path.Join(newPath, chunksName)
+	newFilesPath := path.Join(newPath, filesName)
 	os.Mkdir(newPath, 0775)
 	os.Mkdir(newChunkPath, 0775)
 	newChunks := make(chan []byte, 16)
@@ -188,7 +173,7 @@ func storeChunks(dest string, chunks <-chan []byte) {
 
 func loadChunks(versions []string, chunks chan<- Chunk) {
 	for i, v := range versions {
-		p := path.Join(v, "chunks")
+		p := path.Join(v, chunksName)
 		entries, err := os.ReadDir(p)
 		if err != nil {
 			log.Printf("Error reading version '%05d' in '%s' chunks: %s", i, v, err)
@@ -203,7 +188,7 @@ func loadChunks(versions []string, chunks chan<- Chunk) {
 				log.Printf("Error reading chunk '%s': %s", f, err.Error())
 			}
 			c := Chunk{
-				Id: ChunkId{
+				Id: &ChunkId{
 					Ver: i,
 					Idx: uint64(j),
 				},
@@ -222,15 +207,15 @@ func hashChunks(chunks <-chan Chunk) map[uint64]ChunkId {
 		hasher.Reset()
 		hasher.Write(c.Value)
 		h := hasher.Sum64()
-		hashes[h] = c.Id
+		hashes[h] = *c.Id
 	}
 	return hashes
 }
 
-func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io.Reader {
+func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []Chunk {
 	hasher := rabinkarp64.New()
 	hasher.Write(<-chunks)
-	recipe := make([]io.Reader, 0)
+	recipe := make([]Chunk, 0)
 
 	var i uint64
 	var offset, prefill, postfill int
@@ -258,10 +243,10 @@ func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io
 			hasher.Roll(c[postfill])
 		}
 		if len(buff) > 0 {
-			recipe = append(recipe, bytes.NewReader(buff))
+			recipe = append(recipe, Chunk{Value: buff})
 		}
 		if exists {
-			recipe = append(recipe, chunkId2Reader(chunkId, r.path))
+			recipe = append(recipe, Chunk{Id: &chunkId})
 		}
 		offset %= chunkSize
 		i++
@@ -269,15 +254,6 @@ func (r *Repo) matchChunks(chunks <-chan []byte, hashes map[uint64]ChunkId) []io
 	return recipe
 }
 
-func chunkId2Reader(c ChunkId, repo string) io.Reader {
-	p := path.Join(repo, fmt.Sprintf(versionFmt, c.Ver), "chunks", fmt.Sprintf(chunkIdFmt, c.Idx))
-	f, err := os.Open(p)
-	if err != nil {
-		log.Printf("Cannot open chunk %s\n", p)
-	}
-	return f
-}
-
 func writeFile(filePath string, object interface{}) error {
 	file, err := os.Create(filePath)
 	if err == nil {
diff --git a/repo_test.go b/repo_test.go
index bde2a63..d25ab8a 100644
--- a/repo_test.go
+++ b/repo_test.go
@@ -98,7 +98,7 @@ func TestLoadChunks(t *testing.T) {
 	resultDir := prepareResult()
 	dataDir := path.Join("test", "data")
 	resultVersion := path.Join(resultDir, "00000")
-	resultChunks := path.Join(resultVersion, "chunks")
+	resultChunks := path.Join(resultVersion, chunksName)
 	os.MkdirAll(resultChunks, 0775)
 	chunks1 := make(chan []byte, 16)
 	chunks2 := make(chan []byte, 16)
@@ -125,7 +125,7 @@ func TestLoadChunks(t *testing.T) {
 func TestStoreLoadFiles(t *testing.T) {
 	resultDir := prepareResult()
 	dataDir := path.Join("test", "data")
-	resultFiles := path.Join(resultDir, "files")
+	resultFiles := path.Join(resultDir, filesName)
 	files1 := listFiles(dataDir)
 	storeFiles(resultFiles, files1)
 	files2 := loadFiles(resultFiles)
@@ -143,7 +143,7 @@ func TestBsdiff(t *testing.T) {
 	dataDir := path.Join("test", "data")
 	addedFile := path.Join(dataDir, "logs.2", "slogTest.log")
 	resultVersion := path.Join(resultDir, "00000")
-	resultChunks := path.Join(resultVersion, "chunks")
+	resultChunks := path.Join(resultVersion, chunksName)
 	os.MkdirAll(resultChunks, 0775)
 	chunks := make(chan []byte, 16)
 	files := listFiles(dataDir)
@@ -163,7 +163,12 @@ func TestBsdiff(t *testing.T) {
 	hashes := hashChunks(oldChunks)
 	recipe := repo.matchChunks(newChunks, hashes)
 	buff := new(bytes.Buffer)
-	bsdiff.Reader(recipe[2], recipe[0], buff)
+	r2, _ := recipe[2].Reader(repo.path)
+	r0, _ := recipe[0].Reader(repo.path)
+	bsdiff.Reader(r2, r0, buff)
+	if len(buff.Bytes()) < 500 {
+		t.Errorf("Bsdiff of chunk is too small: %d", len(buff.Bytes()))
+	}
 	if len(buff.Bytes()) >= chunkSize {
 		t.Errorf("Bsdiff of chunk is too large: %d", len(buff.Bytes()))
 	}
-- 
cgit v1.2.3