diff options
author | n-peugnet <n.peugnet@free.fr> | 2021-09-09 12:09:18 +0200 |
---|---|---|
committer | n-peugnet <n.peugnet@free.fr> | 2021-09-09 12:10:45 +0200 |
commit | 8a03c46bf24b5a1fa1d2080ac4f763532db01bbe (patch) | |
tree | 069554f3e1e3e235a22d13dbb4a4a555b2d6e0d6 /sketch | |
parent | f061a7031181ef53d034c46b696156c143451cce (diff) | |
download | dna-backup-8a03c46bf24b5a1fa1d2080ac4f763532db01bbe.tar.gz dna-backup-8a03c46bf24b5a1fa1d2080ac4f763532db01bbe.zip |
export sketch in its own package
so that tests can be cached and to make sure it is independent of
the rest of the code
also move tests in testdata as this folder is ignored by go test by default
Diffstat (limited to 'sketch')
-rw-r--r-- | sketch/sketch.go | 85 | ||||
-rw-r--r-- | sketch/sketch_test.go | 43 | ||||
-rw-r--r-- | sketch/testdata/000000000000000 | 85 | ||||
-rw-r--r-- | sketch/testdata/000000000000014 | 47 |
4 files changed, 260 insertions, 0 deletions
diff --git a/sketch/sketch.go b/sketch/sketch.go new file mode 100644 index 0000000..12e62fa --- /dev/null +++ b/sketch/sketch.go @@ -0,0 +1,85 @@ +package sketch + +import ( + "bytes" + "encoding/binary" + "io" + "log" + "sync" + + "github.com/chmduquesne/rollinghash/rabinkarp64" +) + +type Sketch []uint64 + +type ReadByteReader interface { + io.Reader + io.ByteReader +} + +const fBytes = 8 + +// SketchChunk produces a sketch for a chunk based on wSize: the window size, +// sfCount: the number of super-features, and fCount: the number of feature +// per super-feature +func SketchChunk(r io.Reader, pol rabinkarp64.Pol, chunkSize int, wSize int, sfCount int, fCount int) (Sketch, error) { + var wg sync.WaitGroup + var fSize = FeatureSize(chunkSize, sfCount, fCount) + var chunk bytes.Buffer + superfeatures := make([]uint64, 0, sfCount) + features := make([]uint64, 0, fCount*sfCount) + sfBuff := make([]byte, fBytes*fCount) + chunkLen, err := chunk.ReadFrom(r) + if err != nil { + log.Panicln(chunkLen, err) + } + for f := 0; f < int(chunkLen)/fSize; f++ { + var fBuff bytes.Buffer + n, err := io.CopyN(&fBuff, &chunk, int64(fSize)) + if err != nil { + log.Println(n, err) + continue + } + features = append(features, 0) + wg.Add(1) + go calcFeature(&wg, pol, &fBuff, wSize, fSize, &features[f]) + } + hasher := rabinkarp64.NewFromPol(pol) + wg.Wait() + for sf := 0; sf < len(features)/fCount; sf++ { + for i := 0; i < fCount; i++ { + binary.LittleEndian.PutUint64(sfBuff[i*fBytes:(i+1)*fBytes], features[i+sf*fCount]) + } + hasher.Reset() + hasher.Write(sfBuff) + superfeatures = append(superfeatures, hasher.Sum64()) + } + return superfeatures, nil +} + +func calcFeature(wg *sync.WaitGroup, p rabinkarp64.Pol, r ReadByteReader, wSize int, fSize int, result *uint64) { + defer wg.Done() + hasher := rabinkarp64.NewFromPol(p) + n, err := io.CopyN(hasher, r, int64(wSize)) + if err != nil { + log.Println(n, err) + } + max := hasher.Sum64() + for w := 0; w < fSize-wSize; w++ { + b, _ := 
r.ReadByte() + hasher.Roll(b) + h := hasher.Sum64() + if h > max { + max = h + } + } + *result = max +} + +func SuperFeatureSize(chunkSize int, sfCount int, fCount int) int { + return FeatureSize(chunkSize, sfCount, fCount) * sfCount +} + +func FeatureSize(chunkSize int, sfCount int, fCount int) int { + return chunkSize / (sfCount * fCount) +} diff --git a/sketch/sketch_test.go b/sketch/sketch_test.go new file mode 100644 index 0000000..df35514 --- /dev/null +++ b/sketch/sketch_test.go @@ -0,0 +1,43 @@ +package sketch + +import ( + "os" + "path" + "reflect" + "testing" + + "github.com/chmduquesne/rollinghash/rabinkarp64" +) + +func TestSketchChunk(t *testing.T) { + var sketch, expected Sketch + var err error + dataDir := "testdata" + pol, err := rabinkarp64.RandomPolynomial(1) + if err != nil { + t.Fatal(err) + } + + c0, err := os.Open(path.Join(dataDir, "000000000000000")) + if err != nil { + t.Fatal(err) + } + sketch, err = SketchChunk(c0, pol, 8<<10, 32, 3, 4) + if err != nil { + t.Fatal(err) + } + expected = Sketch{429857165471867, 6595034117354675, 8697818304802825} + if !reflect.DeepEqual(sketch, expected) { + t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch) + } + + c14, err := os.Open(path.Join(dataDir, "000000000000014")) + sketch, err = SketchChunk(c14, pol, 8<<10, 32, 3, 4) + if err != nil { + t.Error(err) + } + expected = Sketch{658454504014104} + if !reflect.DeepEqual(sketch, expected) { + t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch) + } +} diff --git a/sketch/testdata/000000000000000 b/sketch/testdata/000000000000000 new file mode 100644 index 0000000..ac78620 --- /dev/null +++ b/sketch/testdata/000000000000000 @@ -0,0 +1,85 @@ +2019-06-05T20:13 [INFO] testInfoString (LogTest.java:52) +2019-06-05T20:13 [INFO][YO] testInfoString (LogTest.java:57) +2019-06-05T20:13 [INFO] testInfoString (LogTest.java:52) +2019-06-05T20:13 [INFO][YO] testInfoString (LogTest.java:57) +2019-06-05T20:51 [INFO] 
testInfoString (LogTest.java:52) +2019-06-05T20:51 [INFO][YO] testInfoString (LogTest.java:57) +2019-06-05T22:41 [INFO] testInfoString (LogTest.java:52) +2019-06-05T22:41 [INFO][YO] testInfoString (LogTest.java:57) +2019-06-05T23:02 [INFO] testInfoString (LogTest.java:52) +2019-06-05T23:02 [INFO][YO] testInfoString (LogTest.java:57) +2019-06-05T20:13 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-05T20:13 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.52 millis (Loader.java:116) +2019-06-05T20:13 [INFO][MEMORY] 160,128 (Loader.java:117) +2019-06-05T20:13 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-05T20:13 [INFO][MEMORY] 231,367 (Loader.java:138) +2019-06-05T20:13 [INFO][TIMER] Temps pris par le parsing: 0.898 s (LoaderTest.java:69) +2019-06-05T20:13 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-05T20:14 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-05T20:14 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.46 millis (Loader.java:116) +2019-06-05T20:14 [INFO][MEMORY] 267,348 (Loader.java:117) +2019-06-05T20:14 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-05T20:14 [INFO][MEMORY] 231,359 (Loader.java:138) +2019-06-05T20:14 [INFO][TIMER] Temps pris par le parsing: 0.839 s (LoaderTest.java:69) +2019-06-05T20:14 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-05T20:51 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-05T20:51 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 2.61 millis 
(Loader.java:116) +2019-06-05T20:51 [INFO][MEMORY] 189,561 (Loader.java:117) +2019-06-05T20:51 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-05T20:51 [INFO][MEMORY] 168,686 (Loader.java:138) +2019-06-05T20:51 [INFO][TIMER] Temps pris par le parsing: 0.930 s (LoaderTest.java:69) +2019-06-05T20:51 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-05T22:41 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-05T22:41 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 1.83 millis (Loader.java:116) +2019-06-05T22:41 [INFO][MEMORY] 189,494 (Loader.java:117) +2019-06-05T22:41 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-05T22:41 [INFO][MEMORY] 168,619 (Loader.java:138) +2019-06-05T22:41 [INFO][TIMER] Temps pris par le parsing: 0.882 s (LoaderTest.java:69) +2019-06-05T22:41 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-05T23:02 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-05T23:02 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.54 millis (Loader.java:116) +2019-06-05T23:02 [INFO][MEMORY] 189,551 (Loader.java:117) +2019-06-05T23:02 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-05T23:02 [INFO][MEMORY] 168,676 (Loader.java:138) +2019-06-05T23:02 [INFO][TIMER] Temps pris par le parsing: 1.04 s (LoaderTest.java:69) +2019-06-05T23:02 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] 
(LoaderTest.java:97) +2019-06-10T20:34 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-10T20:34 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 4.82 millis (Loader.java:116) +2019-06-10T20:34 [INFO][MEMORY] 82,958 (Loader.java:117) +2019-06-10T20:34 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-10T20:34 [INFO][MEMORY] 163,350 (Loader.java:138) +2019-06-10T20:34 [INFO][TIMER] Temps pris par le parsing: 0.938 s (LoaderTest.java:69) +2019-06-10T20:34 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-10T20:50 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-10T20:50 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.56 millis (Loader.java:116) +2019-06-10T20:50 [INFO][MEMORY] 82,957 (Loader.java:117) +2019-06-10T20:50 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-10T20:50 [INFO][MEMORY] 163,349 (Loader.java:138) +2019-06-10T20:50 [INFO][TIMER] Temps pris par le parsing: 0.853 s (LoaderTest.java:69) +2019-06-10T20:50 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-10T20:51 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-10T20:51 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.51 millis (Loader.java:116) +2019-06-10T20:51 [INFO][MEMORY] 189,538 (Loader.java:117) +2019-06-10T20:51 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-10T20:51 [INFO][MEMORY] 168,663 (Loader.java:138) +2019-06-10T20:51 [INFO][TIMER] Temps pris par le parsing: 1.27 s (LoaderTest.java:69) +2019-06-10T20:51 [DEBUG][line][70] 
[2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-10T21:24 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-10T21:24 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.58 millis (Loader.java:116) +2019-06-10T21:24 [INFO][MEMORY] 189,539 (Loader.java:117) +2019-06-10T21:24 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-10T21:24 [INFO][MEMORY] 168,664 (Loader.java:138) +2019-06-10T21:24 [INFO][TIMER] Temps pris par le parsing: 1.86 s (LoaderTest.java:69) +2019-06-10T21:24 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-05T14:47 [INFO] SerialStructure.loadStructure : OK ! (SerialStructure.java:71) +2019-06-05T14:49 [INFO] PARSE : memusage init = (ParallelLoader.java:112) +2019-06-05T14:49 [INFO][MEMORY] 229,457 (ParallelLoader.java:114) +2019-06-05T14:49 [INFO] Loader : nombre de résultats (total) lus = 100000 temps écoulé = 0.522 sactiveThreadNb = 0 (ParallelLoader.java:150) +2019-06-05T14:49 [ERROR] debugCount = 0 (ParallelLoader.java:203) +2019-06-05T14:49 [WARNING] Fail to save entry number 0: incorrect data (ParallelLoaderRunnable.java:287) +2019-06-05T14:49 [INFO] Parsing terminé !! temps écoulé = 2.78 s (ParallelLoader.java:217) +2019-06-05T14:49 [INFO] PARSE : Nombre de lignes (local) = 100000 (ParallelLoader.java:218) +2019-06-05T14:53 [INFO] SerialStructure.loadStructure : OK ! (SerialStructure.java:71) +2019-06-05T14:53 [INFO] PARSE : memusage init = (ParallelLoader.java:112) +2019-06-05T14:53 [INFO][MEMORY] 227,095 (ParallelLoader.java:114) +2019-06-05T14:53 [INFO] Loader : nombre de résulta
\ No newline at end of file diff --git a/sketch/testdata/000000000000014 b/sketch/testdata/000000000000014 new file mode 100644 index 0000000..ab7db22 --- /dev/null +++ b/sketch/testdata/000000000000014 @@ -0,0 +1,47 @@ +shAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@659a969b (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 157,634 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@76908cc0 (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 157,662 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@2473d930 (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@35047d03 (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@49b0b76 (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@769f71a9 (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@4c9f8c13 (Table.java:338) +2019-06-10T21:24 [ERROR] SerialStructure.writeStructure : impossible de sauvegarder la structure du disque. (SerialStructure.java:56) +2019-06-10T21:24 [ERROR] java.nio.HeapByteBuffer (SerialStructure.java:57) +2019-06-10T21:24 [ERROR] SerialStructure.loadStructure : impossible de charger la structure du disque. 
(SerialStructure.java:73) +2019-06-10T21:24 [ERROR] null (SerialStructure.java:74) +2019-06-10T21:24 [INFO] Lancé (IndexTreeMessyTest.java:366) +2019-06-10T21:24 [INFO][TIMER] tempsPrisPourRecherchesSurFichiers: 37 millis (IndexTreeDic.java:1164) +2019-06-10T21:24 [INFO][TIMER] Temps total recherche: 0.044 s (IndexTreeMessyTest.java:408) +2019-06-10T21:24 [INFO][TIMER] Temps parcours des résultats: 0.59 millis (IndexTreeMessyTest.java:416) +2019-06-10T21:24 [INFO] Nombre de résultats = 3116 (IndexTreeMessyTest.java:417) +2019-06-10T21:24 [INFO][TIMER] tempsPrisPourRecherchesSurFichiers: 6 millis (IndexTreeDic.java:1164) +2019-06-10T21:24 [INFO][TIMER] Temps total recherche: 7.06 millis (IndexTreeMessyTest.java:425) +2019-06-10T21:24 [INFO][TIMER] 1Temps d'acquisition des résultats (chargement du disque de tous les champs): 0.071 s (IndexTreeMessyTest.java:439) +2019-06-10T21:24 [INFO][TIMER] 2Temps d'acquisition des résultats certains champs seulement: 0.049 s (IndexTreeMessyTest.java:449) +2019-06-10T21:24 [INFO] Nombre de résultats = 700 (IndexTreeMessyTest.java:450) +2019-06-10T21:24 [INFO][TIMER] tempsPrisPourRecherchesSurFichiers: 8 millis (IndexTreeDic.java:1164) +2019-06-10T21:24 [INFO] Nombre de résultats (pour 18 exact) = 116 (IndexTreeMessyTest.java:456) +2019-06-10T21:24 [INFO] 40 (HashMapTests.java:27) +2019-06-10T21:24 [INFO][MEMORY] Mem usage début - 88,272 (ParallelLoaderTest.java:56) +2019-06-10T21:24 [INFO] Parsing de csvName = testdata/SMALL_100_000_yellow_tripdata_2015-04.csv (ParallelLoaderTest.java:65) +2019-06-10T21:24 [INFO] PARSE : memusage init = (ParallelLoader.java:112) +2019-06-10T21:24 [INFO][MEMORY] 90,320 (ParallelLoader.java:114) +2019-06-10T21:24 [INFO] Loader : nombre de résultats (total) lus = 300000 temps écoulé = 0.046 sactiveThreadNb = 0 (ParallelLoader.java:150) +2019-06-10T21:24 [WARNING] Fail to save entry number 0: incorrect data (ParallelLoaderRunnable.java:287) +2019-06-10T21:24 [INFO] Parsing terminé !! 
temps écoulé = 0.898 s (ParallelLoader.java:217) +2019-06-10T21:24 [INFO] PARSE : Nombre de lignes (local) = 100000 (ParallelLoader.java:218) +2019-06-10T21:24 [INFO] Parsing de csvName = testdata/SMALL_100_000_yellow_tripdata_2015-04.csv (ParallelLoaderTest.java:65) +2019-06-10T21:24 [INFO] PARSE : memusage init = (ParallelLoader.java:112) +2019-06-10T21:24 [INFO][MEMORY] 232,213 (ParallelLoader.java:114) +2019-06-10T21:24 [INFO] Loader : nombre de résultats (total) lus = 400000 temps écoulé = 0.047 sactiveThreadNb = 0 (ParallelLoader.java:150) +2019-06-10T21:24 [WARNING] Fail to save entry number 0: incorrect data (ParallelLoaderRunnable.java:287) +2019-06-10T21:24 [INFO] Parsing terminé !! temps écoulé = 0.709 s (ParallelLoader.java:217) +2019-06-10T21:24 [INFO] PARSE : Nombre de lignes (local) = 100000 (ParallelLoader.java:218) +2019-06-10T21:24 [INFO][MEMORY] Mem usage fin - 189,539 (ParallelLoaderTest.java:114) +2019-06-10T21:24 [INFO][TIMER] TEMPS TOTAL PRIS PAR TOUS LES PARSINGS: 1.68 s (ParallelLoaderTest.java:115) |