Commit fcf07e92 authored by Romain Loth

WIP: CSV local db API

parent 93d0c79e
<?php
// parse_and_index_csv: index a subset of csv columns for search
// --------------------------------------------------------------
// returns the full csv array (the documents base)
// AND a list of postings (the search index)
function parse_and_index_csv($filename, $typed_cols_to_index, $separator, $quotechar) {
    // list of csv rows
    $base = array();

    // initialize our inverted index by values
    $postings = array();
    foreach ($typed_cols_to_index as $nodetype => $cols) {
        // skip scalar entries (eg "dbtype") that may come along in the decoded JSON
        if (! is_array($cols)) {
            continue;
        }
        $postings[$nodetype] = array();
        // echodump("parse_and_index_csv: typed cols", $cols);
        for ($i = 0; $i < count($cols); $i++) {
            $colname = $cols[$i];
            $postings[$nodetype][$colname] = array();
        }
    }

    // we'll initialize the colnum => colname map from the first row
    $colnames = array();
    $rowid = 0;
    if (($fh = fopen($filename, "r")) !== FALSE) {
        // we assume the first line holds the column titles
        $colnames = fgetcsv($fh, 20000, $separator, $quotechar);

        // we slurp and index the entire CSV
        // NB fgetcsv's 2nd arg is the max length of a line:
        //    we used 2 * the longest we saw in the examples
        //    (change according to your use cases)
        while (($line_fields = fgetcsv($fh, 20000, $separator, $quotechar)) !== FALSE) {
            $num = count($line_fields);
            // echo "<p> $num fields in line $rowid: <br /></p>\n";

            // keep the row in the "database"
            $base[$rowid] = array();
            for ($c = 0; $c < $num; $c++) {
                $colname = $colnames[$c];
                // debug
                // echo "==>/".$colname."/:" . $line_fields[$c] . "<br />\n";

                // store row -> fields -> value
                $base[$rowid][$colname] = $line_fields[$c];

                // fill our search index if the type+col was asked in postings
                foreach (['semantic', 'social'] as $swtype) {
                    if (array_key_exists($swtype, $postings)
                        && array_key_exists($colname, $postings[$swtype])) {
                        // basic tokenisation (TODO specify tokenisation delimiters etc.)
                        $tokens = preg_split("/\W/", $line_fields[$c]);
                        // for debug
                        // echo("indexing column:".$colname." under type:".$swtype.'<br>');
                        // var_dump($tokens);
                        foreach ($tokens as $tok) {
                            if (strlen($tok)) {
                                // POSS: stopwords could be filtered here
                                if (! array_key_exists($tok, $postings[$swtype][$colname])) {
                                    $postings[$swtype][$colname][$tok] = array();
                                }
                                // rowid is a pointer to the document
                                array_push($postings[$swtype][$colname][$tok], $rowid);
                            }
                        }
                    }
                }
            }
            $rowid++;
        }
        fclose($fh);
    }
    return array($base, $postings);
}
?>
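For orientation, a minimal usage sketch of parse_and_index_csv (not part of the commit; the file path, rows and columns below are hypothetical stand-ins):

<?php
// hypothetical two-column corpus, ";"-separated, '"'-quoted
file_put_contents('/tmp/mini.csv',
    "title;author\n".
    "\"Scala basics\";\"-aa-\"\n".
    "\"Python and scala\";\"-bb-\"\n");

include('csv_indexation.php');

// ask for 'title' to be indexed for semantic matching, 'author' for social
$cols_by_type = array('semantic' => array('title'), 'social' => array('author'));
list($docs_base, $postings) = parse_and_index_csv('/tmp/mini.csv', $cols_by_type, ";", '"');

// $docs_base[1]['title']                  => "Python and scala"
// $postings['semantic']['title']['scala'] => [1]
//   (NB: no lowercasing yet, so "Scala" in row 0 is a separate key)
// $postings['social']['author']['bb']     => [1]
//   (NB: the \W tokenisation splits the dashes off "-bb-")
?>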
--- a/LOCALDB/info_div.php
+++ b/LOCALDB/info_div.php
 <?php
 // manage the dynamic additional information in the left panel.
-// ini_set('display_errors',1);
-// ini_set('display_startup_errors',1);
-// error_reporting(-1);
+ini_set('display_errors',1);
+ini_set('display_startup_errors',1);
+error_reporting(-1);
+
+// relative path to dirname "/line"
+$project_root = "../";
+
 // example call:
-// http://blabla/LOCALDB/info_div.php?type=semantic&bi=0&query=[%22Monte%20Carlo%22]&gexf=%22data/AXA/RiskV2PageRank1000.gexf%22&index=ISItermsAxa_2015
+// http://blabla/LOCALDB/info_div.php?type=semantic&bi=0&query=[%22Monte%20Carlo%22]&gexf=%22line/AXA/RiskV2PageRank1000.gexf%22&index=ISItermsAxa_2015
 include('parameters_details.php');
 $max_item_displayed=6;
 
-// echo('graphdb: '. $graphdb.'<br/>');
-$base = new PDO("sqlite:../" .$graphdb);
-include('default_div.php');
+if ($_GET['dbtype'] == "sql") {
+    $base = new PDO("sqlite:".$project_root.$graphdb);
+    include('default_div.php');
+}
+else {
+    // to index: the "searchable columns"
+    if (! array_key_exists('toindex', $_GET)) {
+        echo('<br> info_div.php (csv mode): please provide columns to index <br>');
+    }
+    else {
+        $idxcolsbytype = json_decode($_GET['toindex']);
+        echodump("columns to index", $idxcolsbytype);
+
+        include('csv_indexation.php');
+
+        // DO THE INDEXATION
+        // we use cache if memcached is present (and if we indexed the csv already)
+        // $can_use_cache = False
+        // £TODO use memcached or something to store a serialized version of csv_search_base
+        //       + add all (sem+soc) columns for the index to work !!
+        $csv_search_base = parse_and_index_csv($project_root.$graphdb, $idxcolsbytype, ";", '"');
+
+        $base = $csv_search_base[0];
+        $postings = $csv_search_base[1];
+        echodump("postings", $postings);
+        echodump("base", $base);
+
+        // DO THE SEARCH
+        $searchcols = json_decode($_GET['searchin']);
+
+        // a - split the query
+        $qtokens = preg_split('/\W/', $_GET["query"]);
+
+        // b - compute tfidfs
+        $tfs_per_tok_and_doc = array();
+        $dfs_per_tok = array();
+
+        // for each token
+        for ($k=0 ; $k < count($qtokens) ; $k++) {
+            $tok = $qtokens[$k];
+            $tfs_per_tok_and_doc[$tok] = array();
+            for ($l=0 ; $l < count($searchcols) ; $l++) {
+                $searchable = $postings[$_GET['type']][$searchcols[$l]];
+                echodump("searchable", $searchable);
+
+                // if (array_key_exists($tok, $searchable)) {
+                //     for ($m=0 ; $m < count($searchable[$tok]) ; $m++) {
+                //         $doc_id = $searchable[$tok][$m];
+
+                //         // freq of token per doc
+                //         if (array_key_exists($doc_id, $tfs_per_tok_and_doc[$tok])) {
+                //             $tfs_per_tok_and_doc[$tok][$doc_id]++;
+                //         }
+                //         else {
+                //             $tfs_per_tok_and_doc[$tok][$doc_id] = 1;
+                //         }
+
+                //         // global doc freqs
+                //         if (array_key_exists($tok, $dfs_per_tok)) {
+                //             $dfs_per_tok[$tok]++;
+                //         }
+                //         else {
+                //             $dfs_per_tok[$tok] = 1;
+                //         }
+                //     }
+                // }
+            }
+        }
+
+        // c - score per doc
+        // $nbdoc = count($base);
+        // for ($i=0; $i < $nbdoc; $i++) {
+        // }
+
+        // DISPLAY THE RESULTS
+        // function displayDoc($docId, $score, $base) {
+        //     // POSS: score should have a data-score attribute
+        //     $output = "<li title='".$score."'>";
+        //     $output .= "<p><b>".$base[$docId]['title']."</b></p>";
+        //     $output .= "<p>".$base[$docId]['author']." [".$base[$docId]['pubdate']."], <i>(".$base[$docId]['journal'].")</i></p>";
+        //     $output .= "<p>".$base[$docId]['keywords']."</p>";
+        //     $output .= "</li>";
+        //     return $output;
+        // }
+    }
+}
-/*
- * This function gets the first db name in the data folder
- * IT'S NOT SCALABLE! (If you want to use several db's)
- */
-function getDB ($directory) {
-    //$results = array();
-    $result = "";
-    $handler = opendir($directory);
-    while ($file = readdir($handler)) {
-        if ($file != "." && $file != ".."
-            &&
-            ((strpos($file,'.db~'))==false && (strpos($file,'.db'))==true )
-            ||
-            ((strpos($file,'.sqlite~'))==false && (strpos($file,'.sqlite'))==true)
-        ) {
-            //$results[] = $file;
-            $result = $file;
-            break;
-        }
-    }
-    closedir($handler);
-    //return $results;
-    return $result;
-}
 ?>
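The "c - score per doc" step above is still stubbed out. As a hedged sketch of one natural completion (my reading of the commented tf/df bookkeeping, not code from the commit), the two maps would combine into a classic tf-idf ranking:

<?php
// sketch: rank docs by summed tf-idf over the query tokens, assuming
// $base, $qtokens, $tfs_per_tok_and_doc and $dfs_per_tok were filled
// as in the csv branch above
function score_docs($base, $qtokens, $tfs_per_tok_and_doc, $dfs_per_tok) {
    $nbdoc = count($base);
    $scores = array();
    foreach ($qtokens as $tok) {
        if (! array_key_exists($tok, $dfs_per_tok)) {
            continue;   // token absent from the index
        }
        $idf = log($nbdoc / $dfs_per_tok[$tok]);   // rarer tokens weigh more
        foreach ($tfs_per_tok_and_doc[$tok] as $doc_id => $tf) {
            if (! array_key_exists($doc_id, $scores)) {
                $scores[$doc_id] = 0;
            }
            $scores[$doc_id] += $tf * $idf;
        }
    }
    arsort($scores);   // highest score first; keys are rowids into $base
    return $scores;    // e.g. feed the top n to displayDoc()
}
?>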
--- a/LOCALDB/parameters_details.php
+++ b/LOCALDB/parameters_details.php
@@ -4,16 +4,20 @@
 # loading an associated db for a given gexf as relatedDocs php API
 $gexf_db = array();
-# $gexf_db["data/terrorism/terrorism_bi.gexf"] = "data/terrorism/data.db";
-# $gexf_db["data/ClimateChange/ClimateChangeV1.gexf"] = "data/ClimateChange/wosclimatechange-61715-1-wosclimatechange-db(2).db";
+# £££TODO should be passed by param
 $gexf_db["data/ClimateChange/Maps_S_800.gexf"] = "data/ClimateChange/wos_climate-change_title_2014-2015.db";
 $gexf_db["data/AXA/RiskV2PageRank1000.gexf"] = "data/AXA/data.db";
 $gexf_db["data/AXA/RiskV2PageRank2500.gexf"] = "data/AXA/data.db";
 $gexf_db["data/AXA/RiskV2PageRank5000.gexf"] = "data/AXA/data.db";
+$gexf_db["data/test/mini_for_csv.gexf"] = "data/test/mini_for_csv.tsv";
 
 // TESTS
+// for debug
+echo "<br>";
+var_dump($_GET);
+echo "<br>---<br>";
 // $gexf_db["data/ProgrammeDesCandidats.gexf"] = "foobar";
 $gexf= str_replace('"','',$_GET["gexf"]);
@@ -21,5 +25,12 @@ $gexf= str_replace('"','',$_GET["gexf"]);
 $mainpath=dirname(getcwd())."/";
 $graphdb = $gexf_db[$gexf];
+echodump("graphdb", $graphdb);
+
+function echodump($title, $anyObj) {
+    echo "<br>".$title.": ";
+    echo (json_encode($anyObj, JSON_PRETTY_PRINT));
+    echo "<br>";
+}
 ?>
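To make the lookup above concrete, here is what the debug helper prints for the test entry this commit adds (output shape only, as produced by echodump's json_encode):

<?php
// e.g. $_GET["gexf"] arrives as '"data/test/mini_for_csv.gexf"' and the
// surrounding quotes are stripped by the str_replace above
$graphdb = $gexf_db["data/test/mini_for_csv.gexf"];   // -> "data/test/mini_for_csv.tsv"
echodump("graphdb", $graphdb);
// browser output: <br>graphdb: "data/test/mini_for_csv.tsv"<br>
?>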
<gexf xmlns="http://www.gexf.net/1.1draft" xmlns:viz="http://www.gephi.org/gexf/viz" version="1.1">
  <graph defaultedgetype="undirected" type="static">
    <attributes class="node" type="static">
      <attribute id="0" title="category" type="string"> </attribute>
    </attributes>
    <nodes>
      <node id="T::0" label="scalajs">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="term"/>
        </attvalues>
      </node>
      <node id="T::1" label="scala">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="term"/>
        </attvalues>
      </node>
      <node id="T::2" label="python">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="term"/>
        </attvalues>
      </node>
      <node id="T::3" label="javascript">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="term"/>
        </attvalues>
      </node>
      <node id="T::4" label="php">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="term"/>
        </attvalues>
      </node>
      <node id="P::0" label="-bb-">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="person"/>
        </attvalues>
      </node>
      <node id="P::1" label="-aa-">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="person"/>
        </attvalues>
      </node>
      <node id="P::2" label="-dd-">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="person"/>
        </attvalues>
      </node>
      <node id="P::3" label="-cc-">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="person"/>
        </attvalues>
      </node>
    </nodes>
    <edges>
      <!-- bipartite 1|1 -->
      <edge id="1" source="T::1" target="P::1"></edge>
      <edge id="2" source="T::1" target="P::2"></edge>
      <edge id="3" source="T::2" target="P::3"></edge>
      <edge id="4" source="T::3" target="P::2"></edge>
      <edge id="5" source="T::3" target="P::3"></edge>
      <edge id="6" source="T::4" target="P::0"></edge>
      <edge id="7" source="T::4" target="P::3"></edge>
      <!-- intra sem 1|0 -->
      <edge id="8" source="T::2" target="T::3"></edge>
      <edge id="9" source="T::3" target="T::4"></edge>
      <edge id="10" source="T::0" target="T::1"></edge>
      <edge id="11" source="T::0" target="T::3"></edge>
      <!-- intra soc 0|1 -->
      <edge id="12" source="P::0" target="P::1"></edge>
      <edge id="13" source="P::1" target="P::2"></edge>
      <edge id="14" source="P::2" target="P::3"></edge>
    </edges>
  </graph>
</gexf>
--- a/db.json
+++ b/db.json
 {
+    "data/test": {
+        "first" : "mini_for_csv.gexf",
+        "gexfs": {
+            "mini_for_csv.gexf": {
+                "_comment": "NB: underspecified for csv and for db.json !! so this is a prototype structure; POSS: weighted columns for matching importance",
+                "dbtype": "csv",
+                "semantic": ["title","keywords","text"],
+                "social": ["author"],
+                "dbfile": "mini_for_csv.csv"
+            },
+            "test_with_various_atts.gexf": {}
+        }
+    },
     "data/politoscope": {
         "dbname":null,
         "title":"Politoscope",
         "date":"2017",
@@ -33,11 +47,5 @@
         "gexfs": {
             "graph_example.json": {}
         }
-    },
-    "data/test": {
-        "first" : "test_with_various_atts.gexf",
-        "gexfs": {
-            "test_with_various_atts.gexf": {}
-        }
     }
 }
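The JS side currently ships these column lists back to PHP in the toindex url arg; the "POSS use a direct access from php to db.json" note in the fetcher suggests the server could read them itself. A hypothetical sketch of that direct access (the db.json location and the helper name are assumptions, not part of the commit):

<?php
// hypothetical helper: read the searchable-columns entry straight from db.json
function get_relspace_entry($dbjson_path, $gexf_path) {
    $all = json_decode(file_get_contents($dbjson_path), true);
    $dir  = dirname($gexf_path);    // e.g. "data/test"
    $file = basename($gexf_path);   // e.g. "mini_for_csv.gexf"
    if (isset($all[$dir]['gexfs'][$file])) {
        return $all[$dir]['gexfs'][$file];   // {dbtype, semantic, social, dbfile}
    }
    return null;
}

// $entry = get_relspace_entry("../db.json", "data/test/mini_for_csv.gexf");
// $entry['semantic'] => ["title", "keywords", "text"]
?>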
@@ -11,7 +11,6 @@ TW.gui.colorFuns = {
   'cluster': "clusterColoring"
 }
 
-// Execution: changeGraphAppearanceByFacets( true )
 // It reads scanned node-attributes and prepared legends in TW.Clusters
 // to add the button in the html with the sigmaUtils.gradientColoring(x) listener.
@@ -420,8 +419,12 @@ function topPapersFetcher(swType, qWords, priorHtml, cbNext){
   let stockErrMsg = `<p class="micromessage">
                       Your settings for relatedDocsType are set on ${TW.conf.relatedDocsType}
-                      API but it couldn't be connected to.<br >Check if it is running and
-                      accessible:<br><span class=code>${TW.conf.relatedDocsAPI}</span></p>`
+                      API but it couldn't be connected to.</p>`
+
+  if (TW.conf.relatedDocsType == "api") {
+    stockErrMsg += `<p class="micromessage">Check if it is running and
+                    accessible:<br><span class=code>${TW.conf.relatedDocsAPI}</span></p>`
+  }
 
   let resHTML = ''
@@ -453,7 +456,7 @@ function topPapersFetcher(swType, qWords, priorHtml, cbNext){
     });
   }
   else if (TW.conf.relatedDocsType == "wosLocalDB") {
-    let gexfinfos = TW.fields[TW.File]
+    let gexfinfos = TW.relDocsInfos[TW.File]
     if (!gexfinfos || !gexfinfos[swType]) {
       resHTML =
         `<p>Your settings for relatedDocsType are set on a local wos database,
@@ -464,15 +467,40 @@ function topPapersFetcher(swType, qWords, priorHtml, cbNext){
       return
    }
    else {
+      // /!\ documentation and specification needed for the php use cases /!\
       let joinedQ = JSON.stringify(qWords).split('&').join('__and__');
-      let querytable = gexfinfos[swType]
-      let urlParams = "type="+swType+"&query="+joinedQ+"&gexf="+TW.File+"&index="+querytable+"&n="+TW.conf.relatedDocsMax
+
+      // cf. the php code for these url args:
+      //   - type: the node type (social/semantic)
+      //   - dbtype: 'sql' (classic sqlite like wos)
+      //             or 'csv' (like gargantext exports)
+      // POSS object + join.map(join)
+      let urlParams = "type="+swType+"&query="+joinedQ+"&gexf="+TW.File+"&n="+TW.conf.relatedDocsMax+"&dbtype="+gexfinfos.dbtype
+
+      if (gexfinfos.dbtype == "sql") {
+        var qIndex = gexfinfos[swType]      // a table
+        urlParams += `&index=${qIndex}`
+      }
+      else {
+        // a list of csv columns to search in
+        // ex: for semantic nodes matching we look in 'title', 'keywords' cols
+        //     for social nodes matching we look in 'authors' col... etc.
+        let joinedSearchCols = JSON.stringify(gexfinfos[swType])
+        urlParams += `&searchin=${joinedSearchCols}`
+
+        let joinedAllCols = JSON.stringify(gexfinfos)
+        urlParams += `&toindex=${joinedAllCols}`
+
+        // POSS use a direct access from php to db.json to avoid toindex
+        // POSS make it a REST array like: index[]=title&index[]=keywords
+      }
+
       $.ajax({
         type: 'GET',
-        url: 'LOCALDB/info_div.php',
+        url: TW.conf.relatedDocsAPI + '/info_div.php',
         data: urlParams,
         success : function(data){
-          // console.log('relatedDocs: LOCALDB/info_div.php?'+ urlParams);
+          console.log(`relatedDocs: ${TW.conf.relatedDocsAPI}/info_div.php?${urlParams}`);
          resHTML = data
          cbNext(priorHtml + resHTML)
        },
@@ -481,6 +509,12 @@ function topPapersFetcher(swType, qWords, priorHtml, cbNext){
          cbNext(priorHtml + stockErrMsg)
        }
      });
    }
  }
 }
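Putting the two sides together: for a csv source, the urlParams built above reach LOCALDB/info_div.php roughly as the following $_GET (a sketch based on the db.json test entry; actual values depend on the clicked nodes):

<?php
// sketch of the decoded request as seen by info_div.php in csv mode
$_GET = array(
    'type'     => 'semantic',                    // node type (social/semantic)
    'query'    => '["scala"]',                   // JSON list of selected labels
    'gexf'     => 'data/test/mini_for_csv.gexf',
    'n'        => '10',                          // TW.conf.relatedDocsMax
    'dbtype'   => 'csv',
    'searchin' => '["title","keywords","text"]', // cols to search for this type
    'toindex'  => '{"semantic":["title","keywords","text"],"social":["author"],"dbtype":"csv"}'
);
?>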
@@ -36,11 +36,12 @@ TW.conf = (function(TW){
   TWConf.getRelatedDocs = true
   TWConf.relatedDocsMax = 10
-  TWConf.relatedDocsAPI = "http://127.0.0.1:5000/twitter_search"
   TWConf.relatedDocsType = "wosLocalDB"   // accepted: "twitter" | "wosLocalDB"
                                           // POSSible: "elastic"
+  // TWConf.relatedDocsAPI = "http://127.0.0.1:5000/twitter_search"
+  TWConf.relatedDocsAPI = "LOCALDB"
 
   // =======================
   // DATA FACETS AND LEGENDS
   // =======================
@@ -147,8 +148,8 @@ TW.conf = (function(TW){
   // =============
   // Node typology: categories (resp. 0 and 1) will get these default labels
-  TWConf.catSem = "Terms";
-  TWConf.catSoc = "Document";
+  TWConf.catSem = "term";
+  TWConf.catSoc = "person";
   // NB: these labels may be superseded by the input data's node types values
   //     cf. sortNodeTypes()
@@ -290,7 +291,7 @@ TW.conf = (function(TW){
   // relative sizes (iff ChangeType == both nodetypes)
   TWConf.sizeMult = [];
   TWConf.sizeMult[0] = 1.0;   // ie for node type 0 (<=> sem)
-  TWConf.sizeMult[1] = 3.0;   // ie for node type 1 (<=> soc)
+  TWConf.sizeMult[1] = 2.0;   // ie for node type 1 (<=> soc)
 
   // ===========
@@ -13,7 +13,9 @@ TW.partialGraph = null // will contain the sigma visible graph instance
 TW.labels=[];      // fulltext search list
 TW.gexfPaths={};   // for file selectors iff servermenu
-TW.fields={};      // for related db tablenames
+TW.relDocsInfos={};   // map [graphsource => relatedDocs db fields or tables names]
+                      // TODO requires specifications !!
+
                       // (iff servermenu && relatedDocsType == 'wosLocalDB')
 TW.categories = [];   // possible node types and their inverted map
@@ -258,6 +260,7 @@ function syncRemoteGraphData () {
   for( var path in preRES.data ) {
     var theGexfs = preRES.data[path]["gexfs"]
     for(var aGexf in theGexfs) {
+
       var gexfBasename = aGexf.replace(/\.gexf$/, "")   // more human-readable in the menu
       TW.gexfPaths[gexfBasename] = path+"/"+aGexf
@@ -271,16 +274,41 @@ function syncRemoteGraphData () {
       // for associated wosLocalDBs sql queries
       if (theGexfs[aGexf]) {
-        TW.fields[path+"/"+aGexf] = {"semantic":null, "social":null}
-        if (theGexfs[aGexf]["semantic"] && theGexfs[aGexf]["semantic"]["table"]) {
-          TW.fields[path+"/"+aGexf]['semantic'] = theGexfs[aGexf]["semantic"]["table"]
-        }
-        if (theGexfs[aGexf]["social"] && theGexfs[aGexf]["social"]["table"]) {
-          TW.fields[path+"/"+aGexf]['social'] = theGexfs[aGexf]["social"]["table"]
-        }
+        let gSrcEntry = theGexfs[aGexf]
+
+        TW.relDocsInfos[path+"/"+aGexf] = {"semantic":null, "social":null, "dbtype": null}
+
+        // POSS have this type attribute in db.json *for all the entries*
+        // ----------------------------------------------------------------------------------
+        // choice: we'll keep a flat structure by source unless some use cases need otherwise
+        // ----------------------------------------------------------------------------------
+
+        // csv LocalDB ~ gargantext
+        if(gSrcEntry["dbtype"] && gSrcEntry["dbtype"] == "csv") {
+          TW.relDocsInfos[path+"/"+aGexf]['dbtype'] = "csv"
+          // it's CSV columns here
+          TW.relDocsInfos[path+"/"+aGexf]['semantic'] = gSrcEntry["semantic"]
+          TW.relDocsInfos[path+"/"+aGexf]['social'] = gSrcEntry["social"]
+        }
+        // sqlite LocalDB ~ wos
+        else {
+          TW.relDocsInfos[path+"/"+aGexf]['dbtype'] = "sql"
+          if (theGexfs[aGexf]["semantic"] && theGexfs[aGexf]["semantic"]["table"]) {
+            TW.relDocsInfos[path+"/"+aGexf]['semantic'] = theGexfs[aGexf]["semantic"]["table"]
+          }
+          if (theGexfs[aGexf]["social"] && theGexfs[aGexf]["social"]["table"]) {
+            TW.relDocsInfos[path+"/"+aGexf]['social'] = theGexfs[aGexf]["social"]["table"]
+          }
+        }
+        console.log("TW.relDocsInfos", TW.relDocsInfos)
       }
       else {
-        TW.fields[path+"/"+aGexf] = null
+        TW.relDocsInfos[path+"/"+aGexf] = null
       }
       // ^^^^^^ FIXME see if we got the expected behavior right
       //        (? specifications ?)