diff options
Diffstat (limited to 'sketch')
-rw-r--r-- | sketch/sketch.go | 85 | ||||
-rw-r--r-- | sketch/sketch_test.go | 43 | ||||
-rw-r--r-- | sketch/testdata/000000000000000 | 85 | ||||
-rw-r--r-- | sketch/testdata/000000000000014 | 47 |
4 files changed, 260 insertions, 0 deletions
diff --git a/sketch/sketch.go b/sketch/sketch.go new file mode 100644 index 0000000..12e62fa --- /dev/null +++ b/sketch/sketch.go @@ -0,0 +1,85 @@ +package sketch + +import ( + "bytes" + "encoding/binary" + "io" + "log" + "sync" + + "github.com/chmduquesne/rollinghash/rabinkarp64" +) + +type Sketch []uint64 + +type ReadByteReader interface { + io.Reader + io.ByteReader +} + +const fBytes = 8 + +// SketchChunk produces a sketch for a chunk based on wSize: the window size, +// sfCount: the number of super-features, and fCount: the number of feature +// per super-feature +func SketchChunk(r io.Reader, pol rabinkarp64.Pol, chunkSize int, wSize int, sfCount int, fCount int) (Sketch, error) { + var wg sync.WaitGroup + var fSize = FeatureSize(chunkSize, sfCount, fCount) + var chunk bytes.Buffer + superfeatures := make([]uint64, 0, sfCount) + features := make([]uint64, 0, fCount*sfCount) + sfBuff := make([]byte, fBytes*fCount) + chunkLen, err := chunk.ReadFrom(r) + if err != nil { + log.Panicln(chunkLen, err) + } + for f := 0; f < int(chunkLen)/fSize; f++ { + var fBuff bytes.Buffer + n, err := io.CopyN(&fBuff, &chunk, int64(fSize)) + if err != nil { + log.Println(n, err) + continue + } + features = append(features, 0) + wg.Add(1) + go calcFeature(&wg, pol, &fBuff, wSize, fSize, &features[f]) + } + hasher := rabinkarp64.NewFromPol(pol) + wg.Wait() + for sf := 0; sf < len(features)/fCount; sf++ { + for i := 0; i < fCount; i++ { + binary.LittleEndian.PutUint64(sfBuff[i*fBytes:(i+1)*fBytes], features[i+sf*fCount]) + } + hasher.Reset() + hasher.Write(sfBuff) + superfeatures = append(superfeatures, hasher.Sum64()) + } + return superfeatures, nil +} + +func calcFeature(wg *sync.WaitGroup, p rabinkarp64.Pol, r ReadByteReader, wSize int, fSize int, result *uint64) { + defer wg.Done() + hasher := rabinkarp64.NewFromPol(p) + n, err := io.CopyN(hasher, r, int64(wSize)) + if err != nil { + log.Println(n, err) + } + max := hasher.Sum64() + for w := 0; w < fSize-wSize; w++ { + b, _ := r.ReadByte() + hasher.Roll(b) + h := hasher.Sum64() + if h > max { + max = h + } + } + *result = max +} + +func SuperFeatureSize(chunkSize int, sfCount int, fCount int) int { + return FeatureSize(chunkSize, sfCount, fCount) * sfCount +} + +func FeatureSize(chunkSize int, sfCount int, fCount int) int { + return chunkSize / (sfCount * fCount) +} diff --git a/sketch/sketch_test.go b/sketch/sketch_test.go new file mode 100644 index 0000000..df35514 --- /dev/null +++ b/sketch/sketch_test.go @@ -0,0 +1,43 @@ +package sketch + +import ( + "os" + "path" + "reflect" + "testing" + + "github.com/chmduquesne/rollinghash/rabinkarp64" +) + +func TestSketchChunk(t *testing.T) { + var sketch, expected Sketch + var err error + dataDir := "testdata" + pol, err := rabinkarp64.RandomPolynomial(1) + if err != nil { + t.Fatal(err) + } + + c0, err := os.Open(path.Join(dataDir, "000000000000000")) + if err != nil { + t.Fatal(err) + } + sketch, err = SketchChunk(c0, pol, 8<<10, 32, 3, 4) + if err != nil { + t.Fatal(err) + } + expected = Sketch{429857165471867, 6595034117354675, 8697818304802825} + if !reflect.DeepEqual(sketch, expected) { + t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch) + } + + c14, err := os.Open(path.Join(dataDir, "000000000000014")) + sketch, err = SketchChunk(c14, pol, 8<<10, 32, 3, 4) + if err != nil { + t.Error(err) + } + expected = Sketch{658454504014104} + if !reflect.DeepEqual(sketch, expected) { + t.Errorf("Sketch does not match, expected: %d, actual: %d", expected, sketch) + } +} diff --git a/sketch/testdata/000000000000000 b/sketch/testdata/000000000000000 new file mode 100644 index 0000000..ac78620 --- /dev/null +++ b/sketch/testdata/000000000000000 @@ -0,0 +1,85 @@ +2019-06-05T20:13 [INFO] testInfoString (LogTest.java:52) +2019-06-05T20:13 [INFO][YO] testInfoString (LogTest.java:57) +2019-06-05T20:13 [INFO] testInfoString (LogTest.java:52) +2019-06-05T20:13 [INFO][YO] testInfoString (LogTest.java:57) +2019-06-05T20:51 [INFO] testInfoString (LogTest.java:52) +2019-06-05T20:51 [INFO][YO] testInfoString (LogTest.java:57) +2019-06-05T22:41 [INFO] testInfoString (LogTest.java:52) +2019-06-05T22:41 [INFO][YO] testInfoString (LogTest.java:57) +2019-06-05T23:02 [INFO] testInfoString (LogTest.java:52) +2019-06-05T23:02 [INFO][YO] testInfoString (LogTest.java:57) +2019-06-05T20:13 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-05T20:13 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.52 millis (Loader.java:116) +2019-06-05T20:13 [INFO][MEMORY] 160,128 (Loader.java:117) +2019-06-05T20:13 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-05T20:13 [INFO][MEMORY] 231,367 (Loader.java:138) +2019-06-05T20:13 [INFO][TIMER] Temps pris par le parsing: 0.898 s (LoaderTest.java:69) +2019-06-05T20:13 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-05T20:14 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-05T20:14 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.46 millis (Loader.java:116) +2019-06-05T20:14 [INFO][MEMORY] 267,348 (Loader.java:117) +2019-06-05T20:14 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-05T20:14 [INFO][MEMORY] 231,359 (Loader.java:138) +2019-06-05T20:14 [INFO][TIMER] Temps pris par le parsing: 0.839 s (LoaderTest.java:69) +2019-06-05T20:14 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-05T20:51 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-05T20:51 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 2.61 millis (Loader.java:116) +2019-06-05T20:51 [INFO][MEMORY] 189,561 (Loader.java:117) +2019-06-05T20:51 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-05T20:51 [INFO][MEMORY] 168,686 (Loader.java:138) +2019-06-05T20:51 [INFO][TIMER] Temps pris par le parsing: 0.930 s (LoaderTest.java:69) +2019-06-05T20:51 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-05T22:41 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-05T22:41 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 1.83 millis (Loader.java:116) +2019-06-05T22:41 [INFO][MEMORY] 189,494 (Loader.java:117) +2019-06-05T22:41 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-05T22:41 [INFO][MEMORY] 168,619 (Loader.java:138) +2019-06-05T22:41 [INFO][TIMER] Temps pris par le parsing: 0.882 s (LoaderTest.java:69) +2019-06-05T22:41 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-05T23:02 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-05T23:02 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.54 millis (Loader.java:116) +2019-06-05T23:02 [INFO][MEMORY] 189,551 (Loader.java:117) +2019-06-05T23:02 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-05T23:02 [INFO][MEMORY] 168,676 (Loader.java:138) +2019-06-05T23:02 [INFO][TIMER] Temps pris par le parsing: 1.04 s (LoaderTest.java:69) +2019-06-05T23:02 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-10T20:34 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-10T20:34 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 4.82 millis (Loader.java:116) +2019-06-10T20:34 [INFO][MEMORY] 82,958 (Loader.java:117) +2019-06-10T20:34 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-10T20:34 [INFO][MEMORY] 163,350 (Loader.java:138) +2019-06-10T20:34 [INFO][TIMER] Temps pris par le parsing: 0.938 s (LoaderTest.java:69) +2019-06-10T20:34 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-10T20:50 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-10T20:50 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.56 millis (Loader.java:116) +2019-06-10T20:50 [INFO][MEMORY] 82,957 (Loader.java:117) +2019-06-10T20:50 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-10T20:50 [INFO][MEMORY] 163,349 (Loader.java:138) +2019-06-10T20:50 [INFO][TIMER] Temps pris par le parsing: 0.853 s (LoaderTest.java:69) +2019-06-10T20:50 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-10T20:51 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-10T20:51 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.51 millis (Loader.java:116) +2019-06-10T20:51 [INFO][MEMORY] 189,538 (Loader.java:117) +2019-06-10T20:51 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-10T20:51 [INFO][MEMORY] 168,663 (Loader.java:138) +2019-06-10T20:51 [INFO][TIMER] Temps pris par le parsing: 1.27 s (LoaderTest.java:69) +2019-06-10T20:51 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-10T21:24 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107) +2019-06-10T21:24 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.58 millis (Loader.java:116) +2019-06-10T21:24 [INFO][MEMORY] 189,539 (Loader.java:117) +2019-06-10T21:24 [INFO] PARSE : FINAL USAGE (Loader.java:136) +2019-06-10T21:24 [INFO][MEMORY] 168,664 (Loader.java:138) +2019-06-10T21:24 [INFO][TIMER] Temps pris par le parsing: 1.86 s (LoaderTest.java:69) +2019-06-10T21:24 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97) +2019-06-05T14:47 [INFO] SerialStructure.loadStructure : OK ! (SerialStructure.java:71) +2019-06-05T14:49 [INFO] PARSE : memusage init = (ParallelLoader.java:112) +2019-06-05T14:49 [INFO][MEMORY] 229,457 (ParallelLoader.java:114) +2019-06-05T14:49 [INFO] Loader : nombre de résultats (total) lus = 100000 temps écoulé = 0.522 sactiveThreadNb = 0 (ParallelLoader.java:150) +2019-06-05T14:49 [ERROR] debugCount = 0 (ParallelLoader.java:203) +2019-06-05T14:49 [WARNING] Fail to save entry number 0: incorrect data (ParallelLoaderRunnable.java:287) +2019-06-05T14:49 [INFO] Parsing terminé !! temps écoulé = 2.78 s (ParallelLoader.java:217) +2019-06-05T14:49 [INFO] PARSE : Nombre de lignes (local) = 100000 (ParallelLoader.java:218) +2019-06-05T14:53 [INFO] SerialStructure.loadStructure : OK ! (SerialStructure.java:71) +2019-06-05T14:53 [INFO] PARSE : memusage init = (ParallelLoader.java:112) +2019-06-05T14:53 [INFO][MEMORY] 227,095 (ParallelLoader.java:114) +2019-06-05T14:53 [INFO] Loader : nombre de résulta
\ No newline at end of file diff --git a/sketch/testdata/000000000000014 b/sketch/testdata/000000000000014 new file mode 100644 index 0000000..ab7db22 --- /dev/null +++ b/sketch/testdata/000000000000014 @@ -0,0 +1,47 @@ +shAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@659a969b (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 157,634 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@76908cc0 (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 157,662 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@2473d930 (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@35047d03 (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@49b0b76 (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@769f71a9 (Table.java:338) +2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517) +2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@4c9f8c13 (Table.java:338) +2019-06-10T21:24 [ERROR] SerialStructure.writeStructure : impossible de sauvegarder la structure du disque. (SerialStructure.java:56) +2019-06-10T21:24 [ERROR] java.nio.HeapByteBuffer (SerialStructure.java:57) +2019-06-10T21:24 [ERROR] SerialStructure.loadStructure : impossible de charger la structure du disque. (SerialStructure.java:73) +2019-06-10T21:24 [ERROR] null (SerialStructure.java:74) +2019-06-10T21:24 [INFO] Lancé (IndexTreeMessyTest.java:366) +2019-06-10T21:24 [INFO][TIMER] tempsPrisPourRecherchesSurFichiers: 37 millis (IndexTreeDic.java:1164) +2019-06-10T21:24 [INFO][TIMER] Temps total recherche: 0.044 s (IndexTreeMessyTest.java:408) +2019-06-10T21:24 [INFO][TIMER] Temps parcours des résultats: 0.59 millis (IndexTreeMessyTest.java:416) +2019-06-10T21:24 [INFO] Nombre de résultats = 3116 (IndexTreeMessyTest.java:417) +2019-06-10T21:24 [INFO][TIMER] tempsPrisPourRecherchesSurFichiers: 6 millis (IndexTreeDic.java:1164) +2019-06-10T21:24 [INFO][TIMER] Temps total recherche: 7.06 millis (IndexTreeMessyTest.java:425) +2019-06-10T21:24 [INFO][TIMER] 1Temps d'acquisition des résultats (chargement du disque de tous les champs): 0.071 s (IndexTreeMessyTest.java:439) +2019-06-10T21:24 [INFO][TIMER] 2Temps d'acquisition des résultats certains champs seulement: 0.049 s (IndexTreeMessyTest.java:449) +2019-06-10T21:24 [INFO] Nombre de résultats = 700 (IndexTreeMessyTest.java:450) +2019-06-10T21:24 [INFO][TIMER] tempsPrisPourRecherchesSurFichiers: 8 millis (IndexTreeDic.java:1164) +2019-06-10T21:24 [INFO] Nombre de résultats (pour 18 exact) = 116 (IndexTreeMessyTest.java:456) +2019-06-10T21:24 [INFO] 40 (HashMapTests.java:27) +2019-06-10T21:24 [INFO][MEMORY] Mem usage début - 88,272 (ParallelLoaderTest.java:56) +2019-06-10T21:24 [INFO] Parsing de csvName = testdata/SMALL_100_000_yellow_tripdata_2015-04.csv (ParallelLoaderTest.java:65) +2019-06-10T21:24 [INFO] PARSE : memusage init = (ParallelLoader.java:112) +2019-06-10T21:24 [INFO][MEMORY] 90,320 (ParallelLoader.java:114) +2019-06-10T21:24 [INFO] Loader : nombre de résultats (total) lus = 300000 temps écoulé = 0.046 sactiveThreadNb = 0 (ParallelLoader.java:150) +2019-06-10T21:24 [WARNING] Fail to save entry number 0: incorrect data (ParallelLoaderRunnable.java:287) +2019-06-10T21:24 [INFO] Parsing terminé !! temps écoulé = 0.898 s (ParallelLoader.java:217) +2019-06-10T21:24 [INFO] PARSE : Nombre de lignes (local) = 100000 (ParallelLoader.java:218) +2019-06-10T21:24 [INFO] Parsing de csvName = testdata/SMALL_100_000_yellow_tripdata_2015-04.csv (ParallelLoaderTest.java:65) +2019-06-10T21:24 [INFO] PARSE : memusage init = (ParallelLoader.java:112) +2019-06-10T21:24 [INFO][MEMORY] 232,213 (ParallelLoader.java:114) +2019-06-10T21:24 [INFO] Loader : nombre de résultats (total) lus = 400000 temps écoulé = 0.047 sactiveThreadNb = 0 (ParallelLoader.java:150) +2019-06-10T21:24 [WARNING] Fail to save entry number 0: incorrect data (ParallelLoaderRunnable.java:287) +2019-06-10T21:24 [INFO] Parsing terminé !! temps écoulé = 0.709 s (ParallelLoader.java:217) +2019-06-10T21:24 [INFO] PARSE : Nombre de lignes (local) = 100000 (ParallelLoader.java:218) +2019-06-10T21:24 [INFO][MEMORY] Mem usage fin - 189,539 (ParallelLoaderTest.java:114) +2019-06-10T21:24 [INFO][TIMER] TEMPS TOTAL PRIS PAR TOUS LES PARSINGS: 1.68 s (ParallelLoaderTest.java:115) |