diff options
-rw-r--r-- | TODO.md | 15 | ||||
-rw-r--r-- | docs/go.mod | 1 | ||||
-rw-r--r-- | docs/note-2021-10-07.md | 155 | ||||
-rw-r--r-- | exp/.gitignore | 1 | ||||
-rw-r--r-- | exp/go.mod | 1 | ||||
-rw-r--r-- | main.go | 10 | ||||
-rw-r--r-- | repo/repo.go | 16 | ||||
-rw-r--r-- | repo/repo_test.go | 22 |
8 files changed, 188 insertions, 33 deletions
@@ -5,13 +5,6 @@ priority 1 chunks if no similar chunk is found. Thus, it will need to be of `chunkSize` or less. Otherwise, it could not be possibly used for deduplication. -- [ ] read individual files -- [ ] properly store information to be DNA encoded - - [ ] tar source to keep files metadata ? - - [x] store chunks compressed - - [x] compress before storing - - [x] decompress before loading - - [ ] store compressed chunks into tracks of `trackSize` (1024o) - [x] add chunk cache to uniquely store chunks in RAM - [x] better tests for `(*Repo).Commit` - [x] remove errored files from `fileList` @@ -20,15 +13,15 @@ priority 1 - [ ] add version blocks. - [x] command line with subcommands (like, hmm... git ? for instance). - experiences: - - [ ] compare against UDF + - [ ] compare against UDF (this will not be possible, unless we use a real + CD-ROM) - [ ] make multiple repo versions with multiple parameters - smaller block size priority 2 ---------- -- [ ] use more the `Reader` API (which is analogous to the `IOStream` in Java) +- [ ] read individual files - [ ] refactor `matchStream` as right now it is quite complex -- [x] better test for `(*Repo).matchStream` - [ ] tail packing of `PartialChunks` (this Struct does not exist yet as it is in fact just `TempChunks` for now). This might not be useful if we store the recipe incrementally. @@ -68,7 +61,7 @@ reunion 7/09 - [x] store file list incrementally. 
- [x] compress recipe - [x] compress file list -- [ ] make size comparison between recipe and chunks with some datasets +- [x] make size comparison between recipe and chunks with some datasets ideas ----- diff --git a/docs/go.mod b/docs/go.mod new file mode 100644 index 0000000..f4162d5 --- /dev/null +++ b/docs/go.mod @@ -0,0 +1 @@ +// empty go mod to make go cli skip this directory diff --git a/docs/note-2021-10-07.md b/docs/note-2021-10-07.md new file mode 100644 index 0000000..476964e --- /dev/null +++ b/docs/note-2021-10-07.md @@ -0,0 +1,155 @@ +Run exp on Multiple git repos +============================= + +Linux kernel +------------ + +``` +11:53:07.681 check out 224426f168aa4af3dcb628e6edaa824d32d60e6f +11:53:15.471 create diff for this version +11:53:30.163 create backup for this version +11:54:34.346 restore from diffs +11:54:38.244 check restore from diffs +11:54:38.880 restore from backup +11:54:43.776 check restore from backup +11:54:44.397 check out dbe79bbe9dcb22cb3651c46f18943477141ca452 +11:55:01.762 create diff for this version +11:56:19.545 create backup for this version +11:58:13.244 check out ceeee1fb2897651b434547eb26d93e6d2ff5a1a5 +11:58:27.085 create diff for this version +12:00:14.362 create backup for this version +12:01:09.252 check out f35723ec48ca60f2f3493ea40d63a9bc5b585c28 +12:01:22.871 create diff for this version +12:01:26.590 create backup for this version +12:02:08.992 check out d2cb1a95c5fa4d1691c90a4f530955b4ea3cfa24 +12:02:22.320 create diff for this version +12:02:34.623 create backup for this version +12:03:28.304 restore from diffs +12:03:52.677 check restore from diffs +12:03:53.166 restore from backup +12:03:57.937 check restore from backup +12:03:58.427 check out 03d782524e2d0511317769521c8d5daadbab8482 +12:04:12.099 create diff for this version +12:04:19.500 create backup for this version +12:05:01.339 check out 1cbf4c563c0eaaf11c552a88b374e213181c6ddd +12:05:13.805 create diff for this version +12:05:18.433 create backup 
for this version +12:06:13.891 check out da28438cae9a271c5c232177f81dfb243de9b7fa +12:06:26.596 create diff for this version +12:06:32.695 create backup for this version +12:07:10.455 check out d4b9ba7bf6f38cff55b5d95a0db7dd91311ce20a +12:07:23.631 create diff for this version +12:07:23.663 create backup for this version +12:07:43.166 restore from diffs +12:08:15.771 check restore from diffs +12:08:16.253 restore from backup +12:08:21.572 check restore from backup +12:08:22.035 check out 367636772f094fd840d2d79e75257bcfaa28e70f +12:08:32.149 create diff for this version +12:08:32.270 create backup for this version +12:08:55.599 check out f50f3ac51983025405a71b70b033cc6bcb0d1fc1 +12:09:05.597 create diff for this version +12:09:05.623 create backup for this version +12:09:28.613 check out b59a9504cb93db7fae31e60760725d48652a1fc3 +12:09:40.484 create diff for this version +12:09:43.885 create backup for this version +12:10:19.089 check out 69903d6500c73af8329a5fba7153b0d50748981c +12:10:19.245 create diff for this version +12:10:19.266 create backup for this version +12:10:35.016 restore from diffs +12:11:07.791 check restore from diffs +12:11:08.286 restore from backup +12:11:14.264 check restore from backup +12:11:14.752 check out c264852726dde251a0c09ec22f61a9be8b0db68b +12:11:14.835 create diff for this version +12:11:14.855 create backup for this version +12:11:42.260 check out 0964ce24d091a1d3dc7f667e1b107ab77d4325e6 +12:11:42.340 create diff for this version +12:11:42.361 create backup for this version +12:12:09.532 check out b188ffe876382ecc009ceb4fe033fd6ec7ba4ede +12:12:09.614 create diff for this version +12:12:09.635 create backup for this version +12:12:25.965 check out 14f8351a313f364afbc565f1ddcd43f8cfdccf52 +12:12:37.653 create diff for this version +12:12:41.413 create backup for this version +12:13:06.960 restore from diffs +12:13:41.277 check restore from diffs +12:13:41.740 restore from backup +12:13:47.783 check restore from backup +12:13:48.247 
check out 07a4e2da7dd3c9345f84b2552872f9d38c257451 +12:13:58.535 create diff for this version +12:13:58.860 create backup for this version +12:14:23.049 check out 3834c3f227725e2395840aed82342bda4ee9d379 +12:14:33.557 create diff for this version +12:14:33.582 create backup for this version +12:14:58.535 check out 9383292f179e1907e7e7ade539ac8fd3b65c1e97 +12:15:10.629 create diff for this version +12:15:14.131 create backup for this version +12:15:46.082 clean up repo +============== SUMMARY ============== +backup diffs +77288903 12926085 +108519248 104610 +43371572 10323162 +17821696 10052904 +40321219 1395 +11047234 20915481 +20219859 639 +11381646 13592417 +131258 368 +243287 231 +7919 238438 +7803303 510 +144214 59771518 +149747 9910136 +4322 87905291 +145265 75958096 +3189502 28806094 +233130 7398260 +6718 2607 +3721760 812 +============== TOTAL ============== +345751802 337909054 +``` + +DNA-backup +---------- + +``` +============== SUMMARY ============== +backup diffs +19280 1451 +3138 638 +2746 4737 +2919 7321 +5430 2665 +6093 1496 +2888 11129 +7577 6004 +5824 2418 +1177 2815 +8396 7551 +8479 5804 +9323 2449 +7234 3397 +7926 2143 +12121 2416 +12273 13301 +12909 3274 +29000 1764 +18634 2162 +24159 9681 +20158 4631 +24669 10147 +25806 4983 +18169 11488 +20882 867 +8063 1768 +4053 4373 +14906 9698 +15514 869 +10193 4188 +6140 23257 +============== TOTAL ============== +376079 170885 +``` diff --git a/exp/.gitignore b/exp/.gitignore index 8267a55..093b79b 100644 --- a/exp/.gitignore +++ b/exp/.gitignore @@ -1,5 +1,6 @@ * !.gitignore !exp.sh +!go.mod !Makefile !README.md diff --git a/exp/go.mod b/exp/go.mod new file mode 100644 index 0000000..f4162d5 --- /dev/null +++ b/exp/go.mod @@ -0,0 +1 @@ +// empty go mod to make go cli skip this directory @@ -22,8 +22,9 @@ const ( ) var ( - logLevel int - format string + logLevel int + chunkSize int + format string ) var commit = command{flag.NewFlagSet("commit", flag.ExitOnError), commitMain, @@ -51,6 +52,7 @@ func 
init() { // setup subcommands for _, s := range subcommands { s.Flag.IntVar(&logLevel, "v", 3, "log verbosity level (0-4)") + s.Flag.IntVar(&chunkSize, "c", 8<<10, "chunk size") } } @@ -85,7 +87,7 @@ func commitMain(args []string) error { } source := args[0] dest := args[1] - repo := repo.NewRepo(dest) + repo := repo.NewRepo(dest, chunkSize) repo.Commit(source) return nil } @@ -96,7 +98,7 @@ func restoreMain(args []string) error { } source := args[0] dest := args[1] - repo := repo.NewRepo(source) + repo := repo.NewRepo(source, chunkSize) repo.Restore(dest) return nil } diff --git a/repo/repo.go b/repo/repo.go index 678b091..27df54e 100644 --- a/repo/repo.go +++ b/repo/repo.go @@ -104,7 +104,7 @@ type File struct { Link string } -func NewRepo(path string) *Repo { +func NewRepo(path string, chunkSize int) *Repo { var err error path, err = filepath.Abs(path) if err != nil { @@ -121,7 +121,7 @@ func NewRepo(path string) *Repo { } return &Repo{ path: path, - chunkSize: 8 << 10, + chunkSize: chunkSize, sketchWSize: 32, sketchSfCount: 3, sketchFCount: 4, @@ -155,13 +155,9 @@ func (r *Repo) Commit(source string) { newChunkPath := filepath.Join(newPath, chunksName) os.Mkdir(newPath, 0775) // TODO: handle errors os.Mkdir(newChunkPath, 0775) // TODO: handle errors - logger.Info("listing files") files := listFiles(source) - logger.Info("loading previous hashes") r.loadHashes(versions) - logger.Info("loading previous file lists") r.loadFileLists(versions) - logger.Info("loading previous recipies") r.loadRecipes(versions) storeQueue := make(chan chunkData, 32) storeEnd := make(chan bool) @@ -183,7 +179,6 @@ func (r *Repo) Commit(source string) { func (r *Repo) Restore(destination string) { versions := r.loadVersions() - logger.Info("loading previous file lists") r.loadFileLists(versions) logger.Info("loading previous recipies") r.loadRecipes(versions) @@ -233,6 +228,7 @@ func (r *Repo) loadVersions() []string { } func listFiles(path string) []File { + logger.Infof("list files 
from %s", path) var files []File err := filepath.Walk(path, func(p string, i fs.FileInfo, err error) error { if err != nil { @@ -353,6 +349,7 @@ func storeDelta(prevRaw []byte, curr interface{}, dest string, differ delta.Diff if err = encoder.Encode(curr); err != nil { logger.Panic(err) } + logger.Infof("store before delta: %d", currBuff.Len()) file, err := os.Create(dest) if err != nil { logger.Panic(err) @@ -412,12 +409,14 @@ func loadDeltas(target interface{}, versions []string, patcher delta.Patcher, wr // storeFileList stores the given list in the repo dir as a delta against the // previous version's one. func (r *Repo) storeFileList(version int, list []File) { + logger.Info("store files") dest := filepath.Join(r.path, fmt.Sprintf(versionFmt, version), filesName) storeDelta(r.filesRaw, list, dest, r.differ, r.chunkWriteWrapper) } // loadFileLists loads incrementally the file lists' delta of each given version. func (r *Repo) loadFileLists(versions []string) { + logger.Info("load previous file lists") var files []File r.filesRaw = loadDeltas(&files, versions, r.patcher, r.chunkReadWrapper, filesName) r.files = files @@ -516,6 +515,7 @@ func (r *Repo) loadChunks(versions []string, chunks chan<- IdentifiedChunk) { // loadHashes loads and aggregates the hashes stored for each given version and // stores them in the repo maps. 
func (r *Repo) loadHashes(versions []string) { + logger.Info("load previous hashes") for i, v := range versions { path := filepath.Join(v, hashesName) file, err := os.Open(path) @@ -740,11 +740,13 @@ func (r *Repo) restoreStream(stream io.WriteCloser, recipe []Chunk) { } func (r *Repo) storeRecipe(version int, recipe []Chunk) { + logger.Info("store recipe") dest := filepath.Join(r.path, fmt.Sprintf(versionFmt, version), recipeName) storeDelta(r.recipeRaw, recipe, dest, r.differ, r.chunkWriteWrapper) } func (r *Repo) loadRecipes(versions []string) { + logger.Info("load previous recipies") var recipe []Chunk r.recipeRaw = loadDeltas(&recipe, versions, r.patcher, r.chunkReadWrapper, recipeName) for _, c := range recipe { diff --git a/repo/repo_test.go b/repo/repo_test.go index 36cc1bd..e3a49c7 100644 --- a/repo/repo_test.go +++ b/repo/repo_test.go @@ -157,7 +157,7 @@ func (r *Repo) makeSketch(id *ChunkId, reader io.Reader, wg *sync.WaitGroup, ret func TestReadFiles1(t *testing.T) { tmpDir := t.TempDir() - repo := NewRepo(tmpDir) + repo := NewRepo(tmpDir, 8<<10) chunkCount := 590/repo.chunkSize + 1 dataDir := filepath.Join("testdata", "logs", "1") files := []string{"logTest.log"} @@ -166,7 +166,7 @@ func TestReadFiles1(t *testing.T) { func TestReadFiles2(t *testing.T) { tmpDir := t.TempDir() - repo := NewRepo(tmpDir) + repo := NewRepo(tmpDir, 8<<10) chunkCount := 22899/repo.chunkSize + 1 dataDir := filepath.Join("testdata", "logs", "2") files := []string{"csvParserTest.log", "slipdb.log"} @@ -175,7 +175,7 @@ func TestReadFiles2(t *testing.T) { func TestReadFiles3(t *testing.T) { tmpDir := t.TempDir() - repo := NewRepo(tmpDir) + repo := NewRepo(tmpDir, 8<<10) chunkCount := 119398/repo.chunkSize + 1 dataDir := filepath.Join("testdata", "logs") files := []string{ @@ -234,7 +234,7 @@ func TestSymlinks(t *testing.T) { func TestLoadChunks(t *testing.T) { resultDir := t.TempDir() dataDir := filepath.Join("testdata", "logs") - repo := NewRepo(resultDir) + repo := 
NewRepo(resultDir, 8<<10) repo.chunkReadWrapper = utils.NopReadWrapper repo.chunkWriteWrapper = utils.NopWriteWrapper resultVersion := filepath.Join(resultDir, "00000") @@ -291,7 +291,7 @@ func TestBsdiff(t *testing.T) { logger.SetLevel(3) defer logger.SetLevel(4) resultDir := t.TempDir() - repo := NewRepo(resultDir) + repo := NewRepo(resultDir, 8<<10) dataDir := filepath.Join("testdata", "logs") addedFile1 := filepath.Join(dataDir, "2", "slogTest.log") addedFile2 := filepath.Join(dataDir, "3", "slogTest.log") @@ -341,7 +341,7 @@ func TestCommitZlib(t *testing.T) { dest := t.TempDir() source := filepath.Join("testdata", "logs") expected := filepath.Join("testdata", "repo_8k_zlib") - repo := NewRepo(dest) + repo := NewRepo(dest, 8<<10) repo.patcher = delta.Fdelta{} repo.differ = delta.Fdelta{} repo.chunkReadWrapper = utils.ZlibReader @@ -357,7 +357,7 @@ func TestRestoreZlib(t *testing.T) { dest := t.TempDir() source := filepath.Join("testdata", "repo_8k_zlib") expected := filepath.Join("testdata", "logs") - repo := NewRepo(source) + repo := NewRepo(source, 8<<10) repo.patcher = delta.Fdelta{} repo.differ = delta.Fdelta{} repo.chunkReadWrapper = utils.ZlibReader @@ -373,8 +373,8 @@ func TestRoundtrip(t *testing.T) { temp := t.TempDir() dest := t.TempDir() source := filepath.Join("testdata", "logs") - repo1 := NewRepo(temp) - repo2 := NewRepo(temp) + repo1 := NewRepo(temp, 8<<10) + repo2 := NewRepo(temp, 8<<10) repo1.Commit(source) // Commit a second version, just to see if it does not destroy everything @@ -393,7 +393,7 @@ func TestHashes(t *testing.T) { storeQueue := make(chan chunkData, 16) storeEnd := make(chan bool) - repo1 := NewRepo(source) + repo1 := NewRepo(source, 8<<10) repo1.chunkReadWrapper = utils.NopReadWrapper repo1.chunkWriteWrapper = utils.NopWriteWrapper go repo1.loadChunks([]string{filepath.Join(source, "00000")}, chunks) @@ -409,7 +409,7 @@ func TestHashes(t *testing.T) { id: c.GetId(), } } - repo2 := NewRepo(dest) + repo2 := NewRepo(dest, 8<<10) 
repo2.chunkReadWrapper = utils.NopReadWrapper repo2.chunkWriteWrapper = utils.NopWriteWrapper os.MkdirAll(filepath.Join(dest, "00000", chunksName), 0775) |