aboutsummaryrefslogtreecommitdiff
path: root/sketch
diff options
context:
space:
mode:
Diffstat (limited to 'sketch')
-rw-r--r-- sketch/sketch.go 85
-rw-r--r-- sketch/sketch_test.go 43
-rw-r--r-- sketch/testdata/000000000000000 85
-rw-r--r-- sketch/testdata/000000000000014 47
4 files changed, 260 insertions, 0 deletions
diff --git a/sketch/sketch.go b/sketch/sketch.go
new file mode 100644
index 0000000..12e62fa
--- /dev/null
+++ b/sketch/sketch.go
@@ -0,0 +1,85 @@
+package sketch
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"log"
+	"sync"
+
+	"github.com/chmduquesne/rollinghash/rabinkarp64"
+)
+
+// Sketch is the list of super-feature hashes that SketchChunk computes for a
+// chunk.
+type Sketch []uint64
+
+// ReadByteReader groups io.Reader and io.ByteReader. calcFeature needs both:
+// it copies a whole window from the source, then reads it one byte at a time.
+type ReadByteReader interface {
+ io.Reader
+ io.ByteReader
+}
+
+// fBytes is the number of bytes one feature (a uint64) occupies once it is
+// serialized into the buffer that is hashed into a super-feature.
+const fBytes = 8
+
+// SketchChunk produces a sketch for the chunk read from r.
+//
+// The chunk is read entirely, then cut into consecutive features of
+// FeatureSize(chunkSize, sfCount, fCount) bytes; trailing bytes that do not
+// fill a whole feature are ignored. The value of a feature is computed by
+// calcFeature with the polynomial pol and the window size wSize. Each complete
+// group of fCount consecutive features is then hashed into one super-feature,
+// and the super-features form the returned sketch.
+//
+// Parameters:
+//   - chunkSize: the nominal chunk size, only used to derive the feature size
+//   - wSize: the rolling-hash window size
+//   - sfCount: the number of super-features of a full-size chunk
+//   - fCount: the number of features per super-feature
+//
+// A chunk holding fewer than sfCount*fCount features yields fewer
+// super-features. An error is returned when r cannot be read, or when the
+// parameters leave no room for a feature of at least one byte.
+func SketchChunk(r io.Reader, pol rabinkarp64.Pol, chunkSize int, wSize int, sfCount int, fCount int) (Sketch, error) {
+	// Reject the values that would otherwise cause a division by zero.
+	if sfCount <= 0 || fCount <= 0 {
+		return nil, fmt.Errorf("sketch: invalid counts: sfCount=%d, fCount=%d", sfCount, fCount)
+	}
+	fSize := FeatureSize(chunkSize, sfCount, fCount)
+	if fSize <= 0 {
+		return nil, fmt.Errorf("sketch: chunk size %d is too small for %d features", chunkSize, sfCount*fCount)
+	}
+
+	var chunk bytes.Buffer
+	if _, err := chunk.ReadFrom(r); err != nil {
+		return nil, fmt.Errorf("sketch: reading chunk: %w", err)
+	}
+	data := chunk.Bytes()
+
+	// Size the slice up front: every goroutine keeps a pointer to its own
+	// element, so the backing array must never be reallocated while they run.
+	features := make([]uint64, len(data)/fSize)
+	var wg sync.WaitGroup
+	wg.Add(len(features))
+	for f := range features {
+		start := f * fSize
+		go calcFeature(&wg, pol, bytes.NewReader(data[start:start+fSize]), wSize, fSize, &features[f])
+	}
+	wg.Wait()
+
+	superfeatures := make([]uint64, 0, len(features)/fCount)
+	sfBuff := make([]byte, fBytes*fCount)
+	hasher := rabinkarp64.NewFromPol(pol)
+	for sf := 0; sf < len(features)/fCount; sf++ {
+		// Serialize the fCount features of this group side by side.
+		for i, feature := range features[sf*fCount : (sf+1)*fCount] {
+			binary.LittleEndian.PutUint64(sfBuff[i*fBytes:(i+1)*fBytes], feature)
+		}
+		hasher.Reset()
+		// NOTE(review): the result of Write is dropped, as in the original;
+		// presumably the hasher never fails on Write - confirm.
+		hasher.Write(sfBuff)
+		superfeatures = append(superfeatures, hasher.Sum64())
+	}
+	return superfeatures, nil
+}
+
+// calcFeature computes the value of one feature and stores it in *result.
+//
+// It hashes the first wSize bytes of r with the polynomial p, then rolls the
+// window one byte at a time over the next fSize-wSize bytes and keeps the
+// highest hash seen. It calls wg.Done when it returns, so the caller must
+// have accounted for it with wg.Add before starting it.
+func calcFeature(wg *sync.WaitGroup, p rabinkarp64.Pol, r ReadByteReader, wSize int, fSize int, result *uint64) {
+	defer wg.Done()
+	hasher := rabinkarp64.NewFromPol(p)
+	// A short first window is logged but not fatal: the feature is then
+	// derived from whatever bytes were available.
+	if n, err := io.CopyN(hasher, r, int64(wSize)); err != nil {
+		log.Println(n, err)
+	}
+	highest := hasher.Sum64()
+	for w := 0; w < fSize-wSize; w++ {
+		b, err := r.ReadByte()
+		if err != nil {
+			// Stop at the end of the data instead of rolling in zero bytes
+			// that are not part of the feature.
+			log.Println(w, err)
+			break
+		}
+		hasher.Roll(b)
+		if h := hasher.Sum64(); h > highest {
+			highest = h
+		}
+	}
+	*result = highest
+}
+
+// SuperFeatureSize returns the number of bytes of a chunk covered by one
+// super-feature, that is fCount features of FeatureSize bytes each.
+func SuperFeatureSize(chunkSize int, sfCount int, fCount int) int {
+	return FeatureSize(chunkSize, sfCount, fCount) * fCount
+}
+
+// FeatureSize returns the number of bytes of one feature: the chunk size
+// divided (integer division) by the total number of features, sfCount*fCount.
+func FeatureSize(chunkSize int, sfCount int, fCount int) int {
+	totalFeatures := sfCount * fCount
+	return chunkSize / totalFeatures
+}
diff --git a/sketch/sketch_test.go b/sketch/sketch_test.go
new file mode 100644
index 0000000..df35514
--- /dev/null
+++ b/sketch/sketch_test.go
@@ -0,0 +1,43 @@
+package sketch
+
+import (
+ "os"
+ "path"
+ "reflect"
+ "testing"
+
+ "github.com/chmduquesne/rollinghash/rabinkarp64"
+)
+
+// TestSketchChunk compares the sketches of the reference chunks stored in
+// testdata with known values, using the polynomial returned by
+// rabinkarp64.RandomPolynomial(1).
+func TestSketchChunk(t *testing.T) {
+	pol, err := rabinkarp64.RandomPolynomial(1)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tests := []struct {
+		name     string
+		expected Sketch
+	}{
+		{"000000000000000", Sketch{429857165471867, 6595034117354675, 8697818304802825}},
+		{"000000000000014", Sketch{658454504014104}},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			// Check the open error here, before the call to SketchChunk
+			// reassigns err.
+			chunk, err := os.Open(path.Join("testdata", tc.name))
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer chunk.Close()
+
+			sketch, err := SketchChunk(chunk, pol, 8<<10, 32, 3, 4)
+			if err != nil {
+				t.Fatal(err)
+			}
+			if !reflect.DeepEqual(sketch, tc.expected) {
+				t.Errorf("Sketch does not match, expected: %d, actual: %d", tc.expected, sketch)
+			}
+		})
+	}
+}
diff --git a/sketch/testdata/000000000000000 b/sketch/testdata/000000000000000
new file mode 100644
index 0000000..ac78620
--- /dev/null
+++ b/sketch/testdata/000000000000000
@@ -0,0 +1,85 @@
+2019-06-05T20:13 [INFO] testInfoString (LogTest.java:52)
+2019-06-05T20:13 [INFO][YO] testInfoString (LogTest.java:57)
+2019-06-05T20:13 [INFO] testInfoString (LogTest.java:52)
+2019-06-05T20:13 [INFO][YO] testInfoString (LogTest.java:57)
+2019-06-05T20:51 [INFO] testInfoString (LogTest.java:52)
+2019-06-05T20:51 [INFO][YO] testInfoString (LogTest.java:57)
+2019-06-05T22:41 [INFO] testInfoString (LogTest.java:52)
+2019-06-05T22:41 [INFO][YO] testInfoString (LogTest.java:57)
+2019-06-05T23:02 [INFO] testInfoString (LogTest.java:52)
+2019-06-05T23:02 [INFO][YO] testInfoString (LogTest.java:57)
+2019-06-05T20:13 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107)
+2019-06-05T20:13 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.52 millis (Loader.java:116)
+2019-06-05T20:13 [INFO][MEMORY] 160,128 (Loader.java:117)
+2019-06-05T20:13 [INFO] PARSE : FINAL USAGE (Loader.java:136)
+2019-06-05T20:13 [INFO][MEMORY] 231,367 (Loader.java:138)
+2019-06-05T20:13 [INFO][TIMER] Temps pris par le parsing: 0.898 s (LoaderTest.java:69)
+2019-06-05T20:13 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97)
+2019-06-05T20:14 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107)
+2019-06-05T20:14 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.46 millis (Loader.java:116)
+2019-06-05T20:14 [INFO][MEMORY] 267,348 (Loader.java:117)
+2019-06-05T20:14 [INFO] PARSE : FINAL USAGE (Loader.java:136)
+2019-06-05T20:14 [INFO][MEMORY] 231,359 (Loader.java:138)
+2019-06-05T20:14 [INFO][TIMER] Temps pris par le parsing: 0.839 s (LoaderTest.java:69)
+2019-06-05T20:14 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97)
+2019-06-05T20:51 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107)
+2019-06-05T20:51 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 2.61 millis (Loader.java:116)
+2019-06-05T20:51 [INFO][MEMORY] 189,561 (Loader.java:117)
+2019-06-05T20:51 [INFO] PARSE : FINAL USAGE (Loader.java:136)
+2019-06-05T20:51 [INFO][MEMORY] 168,686 (Loader.java:138)
+2019-06-05T20:51 [INFO][TIMER] Temps pris par le parsing: 0.930 s (LoaderTest.java:69)
+2019-06-05T20:51 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97)
+2019-06-05T22:41 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107)
+2019-06-05T22:41 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 1.83 millis (Loader.java:116)
+2019-06-05T22:41 [INFO][MEMORY] 189,494 (Loader.java:117)
+2019-06-05T22:41 [INFO] PARSE : FINAL USAGE (Loader.java:136)
+2019-06-05T22:41 [INFO][MEMORY] 168,619 (Loader.java:138)
+2019-06-05T22:41 [INFO][TIMER] Temps pris par le parsing: 0.882 s (LoaderTest.java:69)
+2019-06-05T22:41 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97)
+2019-06-05T23:02 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107)
+2019-06-05T23:02 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.54 millis (Loader.java:116)
+2019-06-05T23:02 [INFO][MEMORY] 189,551 (Loader.java:117)
+2019-06-05T23:02 [INFO] PARSE : FINAL USAGE (Loader.java:136)
+2019-06-05T23:02 [INFO][MEMORY] 168,676 (Loader.java:138)
+2019-06-05T23:02 [INFO][TIMER] Temps pris par le parsing: 1.04 s (LoaderTest.java:69)
+2019-06-05T23:02 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97)
+2019-06-10T20:34 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107)
+2019-06-10T20:34 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 4.82 millis (Loader.java:116)
+2019-06-10T20:34 [INFO][MEMORY] 82,958 (Loader.java:117)
+2019-06-10T20:34 [INFO] PARSE : FINAL USAGE (Loader.java:136)
+2019-06-10T20:34 [INFO][MEMORY] 163,350 (Loader.java:138)
+2019-06-10T20:34 [INFO][TIMER] Temps pris par le parsing: 0.938 s (LoaderTest.java:69)
+2019-06-10T20:34 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97)
+2019-06-10T20:50 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107)
+2019-06-10T20:50 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.56 millis (Loader.java:116)
+2019-06-10T20:50 [INFO][MEMORY] 82,957 (Loader.java:117)
+2019-06-10T20:50 [INFO] PARSE : FINAL USAGE (Loader.java:136)
+2019-06-10T20:50 [INFO][MEMORY] 163,349 (Loader.java:138)
+2019-06-10T20:50 [INFO][TIMER] Temps pris par le parsing: 0.853 s (LoaderTest.java:69)
+2019-06-10T20:50 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97)
+2019-06-10T20:51 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107)
+2019-06-10T20:51 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.51 millis (Loader.java:116)
+2019-06-10T20:51 [INFO][MEMORY] 189,538 (Loader.java:117)
+2019-06-10T20:51 [INFO] PARSE : FINAL USAGE (Loader.java:136)
+2019-06-10T20:51 [INFO][MEMORY] 168,663 (Loader.java:138)
+2019-06-10T20:51 [INFO][TIMER] Temps pris par le parsing: 1.27 s (LoaderTest.java:69)
+2019-06-10T20:51 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97)
+2019-06-10T21:24 [WARNING] Fail to save entry number 0: incorrect data (Loader.java:107)
+2019-06-10T21:24 [INFO] Loader : nombre de résultats (local) parsés = 0 temps écoulé = 0.58 millis (Loader.java:116)
+2019-06-10T21:24 [INFO][MEMORY] 189,539 (Loader.java:117)
+2019-06-10T21:24 [INFO] PARSE : FINAL USAGE (Loader.java:136)
+2019-06-10T21:24 [INFO][MEMORY] 168,664 (Loader.java:138)
+2019-06-10T21:24 [INFO][TIMER] Temps pris par le parsing: 1.86 s (LoaderTest.java:69)
+2019-06-10T21:24 [DEBUG][line][70] [2, Thu Apr 09 19:29:33 CEST 2015, Thu Apr 09 19:37:09 CEST 2015, 1, 0.83, -73.98651885986328, 40.76189422607422, 1, N, -73.97399139404297, 40.760414123535156, 1, 6.5, 1.0, 0.5, 1.66, 0.0, 0.3, 9.96] (LoaderTest.java:97)
+2019-06-05T14:47 [INFO] SerialStructure.loadStructure : OK ! (SerialStructure.java:71)
+2019-06-05T14:49 [INFO] PARSE : memusage init = (ParallelLoader.java:112)
+2019-06-05T14:49 [INFO][MEMORY] 229,457 (ParallelLoader.java:114)
+2019-06-05T14:49 [INFO] Loader : nombre de résultats (total) lus = 100000 temps écoulé = 0.522 sactiveThreadNb = 0 (ParallelLoader.java:150)
+2019-06-05T14:49 [ERROR] debugCount = 0 (ParallelLoader.java:203)
+2019-06-05T14:49 [WARNING] Fail to save entry number 0: incorrect data (ParallelLoaderRunnable.java:287)
+2019-06-05T14:49 [INFO] Parsing terminé !! temps écoulé = 2.78 s (ParallelLoader.java:217)
+2019-06-05T14:49 [INFO] PARSE : Nombre de lignes (local) = 100000 (ParallelLoader.java:218)
+2019-06-05T14:53 [INFO] SerialStructure.loadStructure : OK ! (SerialStructure.java:71)
+2019-06-05T14:53 [INFO] PARSE : memusage init = (ParallelLoader.java:112)
+2019-06-05T14:53 [INFO][MEMORY] 227,095 (ParallelLoader.java:114)
+2019-06-05T14:53 [INFO] Loader : nombre de résulta
\ No newline at end of file
diff --git a/sketch/testdata/000000000000014 b/sketch/testdata/000000000000014
new file mode 100644
index 0000000..ab7db22
--- /dev/null
+++ b/sketch/testdata/000000000000014
@@ -0,0 +1,47 @@
+shAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@659a969b (Table.java:338)
+2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 157,634 (IndexTreeDic.java:517)
+2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@76908cc0 (Table.java:338)
+2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 157,662 (IndexTreeDic.java:517)
+2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@2473d930 (Table.java:338)
+2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517)
+2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@35047d03 (Table.java:338)
+2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517)
+2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@49b0b76 (Table.java:338)
+2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517)
+2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@769f71a9 (Table.java:338)
+2019-06-10T21:24 [INFO][MEMORY] IndexTreeDic.flushOnDisk 158,176 (IndexTreeDic.java:517)
+2019-06-10T21:24 [INFO] Table.flushAllIndexOnDisk : flush de l'arbre !index.indexTree.IndexTreeDic@4c9f8c13 (Table.java:338)
+2019-06-10T21:24 [ERROR] SerialStructure.writeStructure : impossible de sauvegarder la structure du disque. (SerialStructure.java:56)
+2019-06-10T21:24 [ERROR] java.nio.HeapByteBuffer (SerialStructure.java:57)
+2019-06-10T21:24 [ERROR] SerialStructure.loadStructure : impossible de charger la structure du disque. (SerialStructure.java:73)
+2019-06-10T21:24 [ERROR] null (SerialStructure.java:74)
+2019-06-10T21:24 [INFO] Lancé (IndexTreeMessyTest.java:366)
+2019-06-10T21:24 [INFO][TIMER] tempsPrisPourRecherchesSurFichiers: 37 millis (IndexTreeDic.java:1164)
+2019-06-10T21:24 [INFO][TIMER] Temps total recherche: 0.044 s (IndexTreeMessyTest.java:408)
+2019-06-10T21:24 [INFO][TIMER] Temps parcours des résultats: 0.59 millis (IndexTreeMessyTest.java:416)
+2019-06-10T21:24 [INFO] Nombre de résultats = 3116 (IndexTreeMessyTest.java:417)
+2019-06-10T21:24 [INFO][TIMER] tempsPrisPourRecherchesSurFichiers: 6 millis (IndexTreeDic.java:1164)
+2019-06-10T21:24 [INFO][TIMER] Temps total recherche: 7.06 millis (IndexTreeMessyTest.java:425)
+2019-06-10T21:24 [INFO][TIMER] 1Temps d'acquisition des résultats (chargement du disque de tous les champs): 0.071 s (IndexTreeMessyTest.java:439)
+2019-06-10T21:24 [INFO][TIMER] 2Temps d'acquisition des résultats certains champs seulement: 0.049 s (IndexTreeMessyTest.java:449)
+2019-06-10T21:24 [INFO] Nombre de résultats = 700 (IndexTreeMessyTest.java:450)
+2019-06-10T21:24 [INFO][TIMER] tempsPrisPourRecherchesSurFichiers: 8 millis (IndexTreeDic.java:1164)
+2019-06-10T21:24 [INFO] Nombre de résultats (pour 18 exact) = 116 (IndexTreeMessyTest.java:456)
+2019-06-10T21:24 [INFO] 40 (HashMapTests.java:27)
+2019-06-10T21:24 [INFO][MEMORY] Mem usage début - 88,272 (ParallelLoaderTest.java:56)
+2019-06-10T21:24 [INFO] Parsing de csvName = testdata/SMALL_100_000_yellow_tripdata_2015-04.csv (ParallelLoaderTest.java:65)
+2019-06-10T21:24 [INFO] PARSE : memusage init = (ParallelLoader.java:112)
+2019-06-10T21:24 [INFO][MEMORY] 90,320 (ParallelLoader.java:114)
+2019-06-10T21:24 [INFO] Loader : nombre de résultats (total) lus = 300000 temps écoulé = 0.046 sactiveThreadNb = 0 (ParallelLoader.java:150)
+2019-06-10T21:24 [WARNING] Fail to save entry number 0: incorrect data (ParallelLoaderRunnable.java:287)
+2019-06-10T21:24 [INFO] Parsing terminé !! temps écoulé = 0.898 s (ParallelLoader.java:217)
+2019-06-10T21:24 [INFO] PARSE : Nombre de lignes (local) = 100000 (ParallelLoader.java:218)
+2019-06-10T21:24 [INFO] Parsing de csvName = testdata/SMALL_100_000_yellow_tripdata_2015-04.csv (ParallelLoaderTest.java:65)
+2019-06-10T21:24 [INFO] PARSE : memusage init = (ParallelLoader.java:112)
+2019-06-10T21:24 [INFO][MEMORY] 232,213 (ParallelLoader.java:114)
+2019-06-10T21:24 [INFO] Loader : nombre de résultats (total) lus = 400000 temps écoulé = 0.047 sactiveThreadNb = 0 (ParallelLoader.java:150)
+2019-06-10T21:24 [WARNING] Fail to save entry number 0: incorrect data (ParallelLoaderRunnable.java:287)
+2019-06-10T21:24 [INFO] Parsing terminé !! temps écoulé = 0.709 s (ParallelLoader.java:217)
+2019-06-10T21:24 [INFO] PARSE : Nombre de lignes (local) = 100000 (ParallelLoader.java:218)
+2019-06-10T21:24 [INFO][MEMORY] Mem usage fin - 189,539 (ParallelLoaderTest.java:114)
+2019-06-10T21:24 [INFO][TIMER] TEMPS TOTAL PRIS PAR TOUS LES PARSINGS: 1.68 s (ParallelLoaderTest.java:115)