aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--TODO.md15
-rw-r--r--docs/go.mod1
-rw-r--r--docs/note-2021-10-07.md155
-rw-r--r--exp/.gitignore1
-rw-r--r--exp/go.mod1
-rw-r--r--main.go10
-rw-r--r--repo/repo.go16
-rw-r--r--repo/repo_test.go22
8 files changed, 188 insertions, 33 deletions
diff --git a/TODO.md b/TODO.md
index 451d51b..12c4686 100644
--- a/TODO.md
+++ b/TODO.md
@@ -5,13 +5,6 @@ priority 1
chunks if no similar chunk is found. Thus, it will need to be of
`chunkSize` or less. Otherwise, it could not be possibly used for
deduplication.
-- [ ] read individual files
-- [ ] properly store information to be DNA encoded
- - [ ] tar source to keep files metadata ?
- - [x] store chunks compressed
- - [x] compress before storing
- - [x] decompress before loading
- - [ ] store compressed chunks into tracks of `trackSize` (1024o)
- [x] add chunk cache to uniquely store chunks in RAM
- [x] better tests for `(*Repo).Commit`
- [x] remove errored files from `fileList`
@@ -20,15 +13,15 @@ priority 1
- [ ] add version blocks.
- [x] command line with subcommands (like, hmm... git ? for instance).
- experiences:
- - [ ] compare against UDF
+ - [ ] compare against UDF (this will not be possible, unless we use a real
+ CD-ROM)
- [ ] make multiple repo versions with multiple parameters
- smaller block size
priority 2
----------
-- [ ] use more the `Reader` API (which is analogous to the `IOStream` in Java)
+- [ ] read individual files
- [ ] refactor `matchStream` as right now it is quite complex
-- [x] better test for `(*Repo).matchStream`
- [ ] tail packing of `PartialChunks` (this Struct does not exist yet as it is
in fact just `TempChunks` for now).
This might not be useful if we store the recipe incrementally.
@@ -68,7 +61,7 @@ reunion 7/09
- [x] store file list incrementally.
- [x] compress recipe
- [x] compress file list
-- [ ] make size comparison between recipe and chunks with some datasets
+- [x] make size comparison between recipe and chunks with some datasets
ideas
-----
diff --git a/docs/go.mod b/docs/go.mod
new file mode 100644
index 0000000..f4162d5
--- /dev/null
+++ b/docs/go.mod
@@ -0,0 +1 @@
+// empty go mod to make go cli skip this directory
diff --git a/docs/note-2021-10-07.md b/docs/note-2021-10-07.md
new file mode 100644
index 0000000..476964e
--- /dev/null
+++ b/docs/note-2021-10-07.md
@@ -0,0 +1,155 @@
+Run exp on Multiple git repos
+=============================
+
+Linux kernel
+------------
+
+```
+11:53:07.681 check out 224426f168aa4af3dcb628e6edaa824d32d60e6f
+11:53:15.471 create diff for this version
+11:53:30.163 create backup for this version
+11:54:34.346 restore from diffs
+11:54:38.244 check restore from diffs
+11:54:38.880 restore from backup
+11:54:43.776 check restore from backup
+11:54:44.397 check out dbe79bbe9dcb22cb3651c46f18943477141ca452
+11:55:01.762 create diff for this version
+11:56:19.545 create backup for this version
+11:58:13.244 check out ceeee1fb2897651b434547eb26d93e6d2ff5a1a5
+11:58:27.085 create diff for this version
+12:00:14.362 create backup for this version
+12:01:09.252 check out f35723ec48ca60f2f3493ea40d63a9bc5b585c28
+12:01:22.871 create diff for this version
+12:01:26.590 create backup for this version
+12:02:08.992 check out d2cb1a95c5fa4d1691c90a4f530955b4ea3cfa24
+12:02:22.320 create diff for this version
+12:02:34.623 create backup for this version
+12:03:28.304 restore from diffs
+12:03:52.677 check restore from diffs
+12:03:53.166 restore from backup
+12:03:57.937 check restore from backup
+12:03:58.427 check out 03d782524e2d0511317769521c8d5daadbab8482
+12:04:12.099 create diff for this version
+12:04:19.500 create backup for this version
+12:05:01.339 check out 1cbf4c563c0eaaf11c552a88b374e213181c6ddd
+12:05:13.805 create diff for this version
+12:05:18.433 create backup for this version
+12:06:13.891 check out da28438cae9a271c5c232177f81dfb243de9b7fa
+12:06:26.596 create diff for this version
+12:06:32.695 create backup for this version
+12:07:10.455 check out d4b9ba7bf6f38cff55b5d95a0db7dd91311ce20a
+12:07:23.631 create diff for this version
+12:07:23.663 create backup for this version
+12:07:43.166 restore from diffs
+12:08:15.771 check restore from diffs
+12:08:16.253 restore from backup
+12:08:21.572 check restore from backup
+12:08:22.035 check out 367636772f094fd840d2d79e75257bcfaa28e70f
+12:08:32.149 create diff for this version
+12:08:32.270 create backup for this version
+12:08:55.599 check out f50f3ac51983025405a71b70b033cc6bcb0d1fc1
+12:09:05.597 create diff for this version
+12:09:05.623 create backup for this version
+12:09:28.613 check out b59a9504cb93db7fae31e60760725d48652a1fc3
+12:09:40.484 create diff for this version
+12:09:43.885 create backup for this version
+12:10:19.089 check out 69903d6500c73af8329a5fba7153b0d50748981c
+12:10:19.245 create diff for this version
+12:10:19.266 create backup for this version
+12:10:35.016 restore from diffs
+12:11:07.791 check restore from diffs
+12:11:08.286 restore from backup
+12:11:14.264 check restore from backup
+12:11:14.752 check out c264852726dde251a0c09ec22f61a9be8b0db68b
+12:11:14.835 create diff for this version
+12:11:14.855 create backup for this version
+12:11:42.260 check out 0964ce24d091a1d3dc7f667e1b107ab77d4325e6
+12:11:42.340 create diff for this version
+12:11:42.361 create backup for this version
+12:12:09.532 check out b188ffe876382ecc009ceb4fe033fd6ec7ba4ede
+12:12:09.614 create diff for this version
+12:12:09.635 create backup for this version
+12:12:25.965 check out 14f8351a313f364afbc565f1ddcd43f8cfdccf52
+12:12:37.653 create diff for this version
+12:12:41.413 create backup for this version
+12:13:06.960 restore from diffs
+12:13:41.277 check restore from diffs
+12:13:41.740 restore from backup
+12:13:47.783 check restore from backup
+12:13:48.247 check out 07a4e2da7dd3c9345f84b2552872f9d38c257451
+12:13:58.535 create diff for this version
+12:13:58.860 create backup for this version
+12:14:23.049 check out 3834c3f227725e2395840aed82342bda4ee9d379
+12:14:33.557 create diff for this version
+12:14:33.582 create backup for this version
+12:14:58.535 check out 9383292f179e1907e7e7ade539ac8fd3b65c1e97
+12:15:10.629 create diff for this version
+12:15:14.131 create backup for this version
+12:15:46.082 clean up repo
+============== SUMMARY ==============
+backup diffs
+77288903 12926085
+108519248 104610
+43371572 10323162
+17821696 10052904
+40321219 1395
+11047234 20915481
+20219859 639
+11381646 13592417
+131258 368
+243287 231
+7919 238438
+7803303 510
+144214 59771518
+149747 9910136
+4322 87905291
+145265 75958096
+3189502 28806094
+233130 7398260
+6718 2607
+3721760 812
+============== TOTAL ==============
+345751802 337909054
+```
+
+DNA-backup
+----------
+
+```
+============== SUMMARY ==============
+backup diffs
+19280 1451
+3138 638
+2746 4737
+2919 7321
+5430 2665
+6093 1496
+2888 11129
+7577 6004
+5824 2418
+1177 2815
+8396 7551
+8479 5804
+9323 2449
+7234 3397
+7926 2143
+12121 2416
+12273 13301
+12909 3274
+29000 1764
+18634 2162
+24159 9681
+20158 4631
+24669 10147
+25806 4983
+18169 11488
+20882 867
+8063 1768
+4053 4373
+14906 9698
+15514 869
+10193 4188
+6140 23257
+============== TOTAL ==============
+376079 170885
+```
diff --git a/exp/.gitignore b/exp/.gitignore
index 8267a55..093b79b 100644
--- a/exp/.gitignore
+++ b/exp/.gitignore
@@ -1,5 +1,6 @@
*
!.gitignore
!exp.sh
+!go.mod
!Makefile
!README.md
diff --git a/exp/go.mod b/exp/go.mod
new file mode 100644
index 0000000..f4162d5
--- /dev/null
+++ b/exp/go.mod
@@ -0,0 +1 @@
+// empty go mod to make go cli skip this directory
diff --git a/main.go b/main.go
index f1bb1d8..499c4c2 100644
--- a/main.go
+++ b/main.go
@@ -22,8 +22,9 @@ const (
)
var (
- logLevel int
- format string
+ logLevel int
+ chunkSize int
+ format string
)
var commit = command{flag.NewFlagSet("commit", flag.ExitOnError), commitMain,
@@ -51,6 +52,7 @@ func init() {
// setup subcommands
for _, s := range subcommands {
s.Flag.IntVar(&logLevel, "v", 3, "log verbosity level (0-4)")
+ s.Flag.IntVar(&chunkSize, "c", 8<<10, "chunk size")
}
}
@@ -85,7 +87,7 @@ func commitMain(args []string) error {
}
source := args[0]
dest := args[1]
- repo := repo.NewRepo(dest)
+ repo := repo.NewRepo(dest, chunkSize)
repo.Commit(source)
return nil
}
@@ -96,7 +98,7 @@ func restoreMain(args []string) error {
}
source := args[0]
dest := args[1]
- repo := repo.NewRepo(source)
+ repo := repo.NewRepo(source, chunkSize)
repo.Restore(dest)
return nil
}
diff --git a/repo/repo.go b/repo/repo.go
index 678b091..27df54e 100644
--- a/repo/repo.go
+++ b/repo/repo.go
@@ -104,7 +104,7 @@ type File struct {
Link string
}
-func NewRepo(path string) *Repo {
+func NewRepo(path string, chunkSize int) *Repo {
var err error
path, err = filepath.Abs(path)
if err != nil {
@@ -121,7 +121,7 @@ func NewRepo(path string) *Repo {
}
return &Repo{
path: path,
- chunkSize: 8 << 10,
+ chunkSize: chunkSize,
sketchWSize: 32,
sketchSfCount: 3,
sketchFCount: 4,
@@ -155,13 +155,9 @@ func (r *Repo) Commit(source string) {
newChunkPath := filepath.Join(newPath, chunksName)
os.Mkdir(newPath, 0775) // TODO: handle errors
os.Mkdir(newChunkPath, 0775) // TODO: handle errors
- logger.Info("listing files")
files := listFiles(source)
- logger.Info("loading previous hashes")
r.loadHashes(versions)
- logger.Info("loading previous file lists")
r.loadFileLists(versions)
- logger.Info("loading previous recipies")
r.loadRecipes(versions)
storeQueue := make(chan chunkData, 32)
storeEnd := make(chan bool)
@@ -183,7 +179,6 @@ func (r *Repo) Commit(source string) {
func (r *Repo) Restore(destination string) {
versions := r.loadVersions()
- logger.Info("loading previous file lists")
r.loadFileLists(versions)
logger.Info("loading previous recipies")
r.loadRecipes(versions)
@@ -233,6 +228,7 @@ func (r *Repo) loadVersions() []string {
}
func listFiles(path string) []File {
+ logger.Infof("list files from %s", path)
var files []File
err := filepath.Walk(path, func(p string, i fs.FileInfo, err error) error {
if err != nil {
@@ -353,6 +349,7 @@ func storeDelta(prevRaw []byte, curr interface{}, dest string, differ delta.Diff
if err = encoder.Encode(curr); err != nil {
logger.Panic(err)
}
+ logger.Infof("store before delta: %d", currBuff.Len())
file, err := os.Create(dest)
if err != nil {
logger.Panic(err)
@@ -412,12 +409,14 @@ func loadDeltas(target interface{}, versions []string, patcher delta.Patcher, wr
// storeFileList stores the given list in the repo dir as a delta against the
// previous version's one.
func (r *Repo) storeFileList(version int, list []File) {
+ logger.Info("store files")
dest := filepath.Join(r.path, fmt.Sprintf(versionFmt, version), filesName)
storeDelta(r.filesRaw, list, dest, r.differ, r.chunkWriteWrapper)
}
// loadFileLists loads incrementally the file lists' delta of each given version.
func (r *Repo) loadFileLists(versions []string) {
+ logger.Info("load previous file lists")
var files []File
r.filesRaw = loadDeltas(&files, versions, r.patcher, r.chunkReadWrapper, filesName)
r.files = files
@@ -516,6 +515,7 @@ func (r *Repo) loadChunks(versions []string, chunks chan<- IdentifiedChunk) {
// loadHashes loads and aggregates the hashes stored for each given version and
// stores them in the repo maps.
func (r *Repo) loadHashes(versions []string) {
+ logger.Info("load previous hashes")
for i, v := range versions {
path := filepath.Join(v, hashesName)
file, err := os.Open(path)
@@ -740,11 +740,13 @@ func (r *Repo) restoreStream(stream io.WriteCloser, recipe []Chunk) {
}
func (r *Repo) storeRecipe(version int, recipe []Chunk) {
+ logger.Info("store recipe")
dest := filepath.Join(r.path, fmt.Sprintf(versionFmt, version), recipeName)
storeDelta(r.recipeRaw, recipe, dest, r.differ, r.chunkWriteWrapper)
}
func (r *Repo) loadRecipes(versions []string) {
+ logger.Info("load previous recipies")
var recipe []Chunk
r.recipeRaw = loadDeltas(&recipe, versions, r.patcher, r.chunkReadWrapper, recipeName)
for _, c := range recipe {
diff --git a/repo/repo_test.go b/repo/repo_test.go
index 36cc1bd..e3a49c7 100644
--- a/repo/repo_test.go
+++ b/repo/repo_test.go
@@ -157,7 +157,7 @@ func (r *Repo) makeSketch(id *ChunkId, reader io.Reader, wg *sync.WaitGroup, ret
func TestReadFiles1(t *testing.T) {
tmpDir := t.TempDir()
- repo := NewRepo(tmpDir)
+ repo := NewRepo(tmpDir, 8<<10)
chunkCount := 590/repo.chunkSize + 1
dataDir := filepath.Join("testdata", "logs", "1")
files := []string{"logTest.log"}
@@ -166,7 +166,7 @@ func TestReadFiles1(t *testing.T) {
func TestReadFiles2(t *testing.T) {
tmpDir := t.TempDir()
- repo := NewRepo(tmpDir)
+ repo := NewRepo(tmpDir, 8<<10)
chunkCount := 22899/repo.chunkSize + 1
dataDir := filepath.Join("testdata", "logs", "2")
files := []string{"csvParserTest.log", "slipdb.log"}
@@ -175,7 +175,7 @@ func TestReadFiles2(t *testing.T) {
func TestReadFiles3(t *testing.T) {
tmpDir := t.TempDir()
- repo := NewRepo(tmpDir)
+ repo := NewRepo(tmpDir, 8<<10)
chunkCount := 119398/repo.chunkSize + 1
dataDir := filepath.Join("testdata", "logs")
files := []string{
@@ -234,7 +234,7 @@ func TestSymlinks(t *testing.T) {
func TestLoadChunks(t *testing.T) {
resultDir := t.TempDir()
dataDir := filepath.Join("testdata", "logs")
- repo := NewRepo(resultDir)
+ repo := NewRepo(resultDir, 8<<10)
repo.chunkReadWrapper = utils.NopReadWrapper
repo.chunkWriteWrapper = utils.NopWriteWrapper
resultVersion := filepath.Join(resultDir, "00000")
@@ -291,7 +291,7 @@ func TestBsdiff(t *testing.T) {
logger.SetLevel(3)
defer logger.SetLevel(4)
resultDir := t.TempDir()
- repo := NewRepo(resultDir)
+ repo := NewRepo(resultDir, 8<<10)
dataDir := filepath.Join("testdata", "logs")
addedFile1 := filepath.Join(dataDir, "2", "slogTest.log")
addedFile2 := filepath.Join(dataDir, "3", "slogTest.log")
@@ -341,7 +341,7 @@ func TestCommitZlib(t *testing.T) {
dest := t.TempDir()
source := filepath.Join("testdata", "logs")
expected := filepath.Join("testdata", "repo_8k_zlib")
- repo := NewRepo(dest)
+ repo := NewRepo(dest, 8<<10)
repo.patcher = delta.Fdelta{}
repo.differ = delta.Fdelta{}
repo.chunkReadWrapper = utils.ZlibReader
@@ -357,7 +357,7 @@ func TestRestoreZlib(t *testing.T) {
dest := t.TempDir()
source := filepath.Join("testdata", "repo_8k_zlib")
expected := filepath.Join("testdata", "logs")
- repo := NewRepo(source)
+ repo := NewRepo(source, 8<<10)
repo.patcher = delta.Fdelta{}
repo.differ = delta.Fdelta{}
repo.chunkReadWrapper = utils.ZlibReader
@@ -373,8 +373,8 @@ func TestRoundtrip(t *testing.T) {
temp := t.TempDir()
dest := t.TempDir()
source := filepath.Join("testdata", "logs")
- repo1 := NewRepo(temp)
- repo2 := NewRepo(temp)
+ repo1 := NewRepo(temp, 8<<10)
+ repo2 := NewRepo(temp, 8<<10)
repo1.Commit(source)
// Commit a second version, just to see if it does not destroy everything
@@ -393,7 +393,7 @@ func TestHashes(t *testing.T) {
storeQueue := make(chan chunkData, 16)
storeEnd := make(chan bool)
- repo1 := NewRepo(source)
+ repo1 := NewRepo(source, 8<<10)
repo1.chunkReadWrapper = utils.NopReadWrapper
repo1.chunkWriteWrapper = utils.NopWriteWrapper
go repo1.loadChunks([]string{filepath.Join(source, "00000")}, chunks)
@@ -409,7 +409,7 @@ func TestHashes(t *testing.T) {
id: c.GetId(),
}
}
- repo2 := NewRepo(dest)
+ repo2 := NewRepo(dest, 8<<10)
repo2.chunkReadWrapper = utils.NopReadWrapper
repo2.chunkWriteWrapper = utils.NopWriteWrapper
os.MkdirAll(filepath.Join(dest, "00000", chunksName), 0775)