Commit fcf07e92 authored by Romain Loth

WIP: CSV local db API

parent 93d0c79e
<?php
// parse_and_index_csv: index a subset of csv columns for search
// --------------------------------------------------------------
// returns the full csv array (the documents base)
// AND a list of postings (the search index)
function parse_and_index_csv($filename, $typed_cols_to_index, $separator, $quotechar) {
    // list of csv rows
    $base = array();

    // initialize our inverted index by values
    $postings = array();
    foreach ($typed_cols_to_index as $nodetype => $cols) {
        // skip scalar entries (eg "dbtype") that may come along in the decoded JSON
        if (! is_array($cols)) {
            continue;
        }
        $postings[$nodetype] = array();
        // echodump("parse_and_index_csv: typed cols", $cols);
        for ($i = 0; $i < count($cols); $i++) {
            $colname = $cols[$i];
            $postings[$nodetype][$colname] = array();
        }
    }

    // we'll initialize the colnum => colname map from the first row
    $colnames = array();
    $rowid = 0;
    if (($fh = fopen($filename, "r")) !== FALSE) {
        // we assume the first line holds the column titles
        $colnames = fgetcsv($fh, 20000, $separator, $quotechar);

        // we slurp and index the entire CSV
        // NB fgetcsv's 2nd arg is the max length of a line:
        //    we used 2 * the longest we saw in the examples
        //    (change according to your use cases)
        while (($line_fields = fgetcsv($fh, 20000, $separator, $quotechar)) !== FALSE) {
            $num = count($line_fields);
            // echo "<p> $num fields in line $rowid: <br /></p>\n";

            // keep the row in the "database"
            $base[$rowid] = array();
            for ($c = 0; $c < $num; $c++) {
                $colname = $colnames[$c];
                // debug
                // echo "==>/".$colname."/:" . $line_fields[$c] . "<br />\n";

                // store row -> fields -> value
                $base[$rowid][$colname] = $line_fields[$c];

                // fill our search index if the type+col was asked in postings
                foreach (['semantic', 'social'] as $swtype) {
                    if (array_key_exists($swtype, $postings)
                        && array_key_exists($colname, $postings[$swtype])) {
                        // basic tokenisation (TODO specify tokenisation delimiters etc.)
                        $tokens = preg_split("/\W/", $line_fields[$c]);
                        // for debug
                        // echo("indexing column:".$colname." under type:".$swtype.'<br>');
                        // var_dump($tokens);
                        foreach ($tokens as $tok) {
                            if (strlen($tok)) {
                                // POSS: stopwords could be filtered here
                                if (! array_key_exists($tok, $postings[$swtype][$colname])) {
                                    $postings[$swtype][$colname][$tok] = array();
                                }
                                // rowid is a pointer to the document
                                array_push($postings[$swtype][$colname][$tok], $rowid);
                            }
                        }
                    }
                }
            }
            $rowid++;
        }
        fclose($fh);
    }
    return array($base, $postings);
}
?>
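For orientation, a minimal usage sketch of parse_and_index_csv (not part of the commit; the file path, rows and columns below are hypothetical stand-ins):

<?php
// hypothetical two-column corpus, ";"-separated, '"'-quoted
file_put_contents('/tmp/mini.csv',
    "title;author\n".
    "\"Scala basics\";\"-aa-\"\n".
    "\"Python and scala\";\"-bb-\"\n");

include('csv_indexation.php');

// ask for 'title' to be indexed for semantic matching, 'author' for social
$cols_by_type = array('semantic' => array('title'), 'social' => array('author'));
list($docs_base, $postings) = parse_and_index_csv('/tmp/mini.csv', $cols_by_type, ";", '"');

// $docs_base[1]['title']                  => "Python and scala"
// $postings['semantic']['title']['scala'] => [1]
//   (NB: no lowercasing yet, so "Scala" in row 0 is a separate key)
// $postings['social']['author']['bb']     => [1]
//   (NB: the \W tokenisation splits the dashes off "-bb-")
?>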
--- a/LOCALDB/info_div.php
+++ b/LOCALDB/info_div.php
 <?php
 // manage the dynamic additional information in the left panel.
-// ini_set('display_errors',1);
-// ini_set('display_startup_errors',1);
-// error_reporting(-1);
+ini_set('display_errors',1);
+ini_set('display_startup_errors',1);
+error_reporting(-1);
+
+// relative path to dirname "/line"
+$project_root = "../";
+
 // example call:
-// http://blabla/LOCALDB/info_div.php?type=semantic&bi=0&query=[%22Monte%20Carlo%22]&gexf=%22data/AXA/RiskV2PageRank1000.gexf%22&index=ISItermsAxa_2015
+// http://blabla/LOCALDB/info_div.php?type=semantic&bi=0&query=[%22Monte%20Carlo%22]&gexf=%22line/AXA/RiskV2PageRank1000.gexf%22&index=ISItermsAxa_2015
 include('parameters_details.php');
 $max_item_displayed=6;
 
-// echo('graphdb: '. $graphdb.'<br/>');
-$base = new PDO("sqlite:../" .$graphdb);
-include('default_div.php');
+if ($_GET['dbtype'] == "sql") {
+    $base = new PDO("sqlite:".$project_root.$graphdb);
+    include('default_div.php');
+}
+else {
+    // to index: the "searchable columns"
+    if (! array_key_exists('toindex', $_GET)) {
+        echo('<br> info_div.php (csv mode): please provide columns to index <br>');
+    }
+    else {
+        $idxcolsbytype = json_decode($_GET['toindex']);
+        echodump("columns to index", $idxcolsbytype);
+
+        include('csv_indexation.php');
+
+        // DO THE INDEXATION
+        // we use cache if memcached is present (and if we indexed the csv already)
+        // $can_use_cache = False
+        // £TODO use memcached or something to store a serialized version of csv_search_base
+        //       + add all (sem+soc) columns for the index to work !!
+        $csv_search_base = parse_and_index_csv($project_root.$graphdb, $idxcolsbytype, ";", '"');
+
+        $base = $csv_search_base[0];
+        $postings = $csv_search_base[1];
+        echodump("postings", $postings);
+        echodump("base", $base);
+
+        // DO THE SEARCH
+        $searchcols = json_decode($_GET['searchin']);
+
+        // a - split the query
+        $qtokens = preg_split('/\W/', $_GET["query"]);
+
+        // b - compute tfidfs
+        $tfs_per_tok_and_doc = array();
+        $dfs_per_tok = array();
+
+        // for each token
+        for ($k=0 ; $k < count($qtokens) ; $k++) {
+            $tok = $qtokens[$k];
+            $tfs_per_tok_and_doc[$tok] = array();
+            for ($l=0 ; $l < count($searchcols) ; $l++) {
+                $searchable = $postings[$_GET['type']][$searchcols[$l]];
+                echodump("searchable", $searchable);
+
+                // if (array_key_exists($tok, $searchable)) {
+                //     for ($m=0 ; $m < count($searchable[$tok]) ; $m++) {
+                //         $doc_id = $searchable[$tok][$m];
+
+                //         // freq of token per doc
+                //         if (array_key_exists($doc_id, $tfs_per_tok_and_doc[$tok])) {
+                //             $tfs_per_tok_and_doc[$tok][$doc_id]++;
+                //         }
+                //         else {
+                //             $tfs_per_tok_and_doc[$tok][$doc_id] = 1;
+                //         }
+
+                //         // global doc freqs
+                //         if (array_key_exists($tok, $dfs_per_tok)) {
+                //             $dfs_per_tok[$tok]++;
+                //         }
+                //         else {
+                //             $dfs_per_tok[$tok] = 1;
+                //         }
+                //     }
+                // }
+            }
+        }
+
+        // c - score per doc
+        // $nbdoc = count($base);
+        // for ($i=0; $i < $nbdoc; $i++) {
+        // }
+
+        // DISPLAY THE RESULTS
+        // function displayDoc($docId, $score, $base) {
+        //     // POSS: score should have a data-score attribute
+        //     $output = "<li title='".$score."'>";
+        //     $output .= "<p><b>".$base[$docId]['title']."</b></p>";
+        //     $output .= "<p>".$base[$docId]['author']." [".$base[$docId]['pubdate']."], <i>(".$base[$docId]['journal'].")</i></p>";
+        //     $output .= "<p>".$base[$docId]['keywords']."</p>";
+        //     $output .= "</li>";
+        //     return $output;
+        // }
+    }
+}
-/*
- * This function gets the first db name in the data folder
- * IT'S NOT SCALABLE! (If you want to use several db's)
- */
-function getDB ($directory) {
-    //$results = array();
-    $result = "";
-    $handler = opendir($directory);
-    while ($file = readdir($handler)) {
-        if ($file != "." && $file != ".."
-            &&
-            ((strpos($file,'.db~'))==false && (strpos($file,'.db'))==true )
-            ||
-            ((strpos($file,'.sqlite~'))==false && (strpos($file,'.sqlite'))==true)
-        ) {
-            //$results[] = $file;
-            $result = $file;
-            break;
-        }
-    }
-    closedir($handler);
-    //return $results;
-    return $result;
-}
 ?>
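The "c - score per doc" step above is still stubbed out. As a hedged sketch of one natural completion (my reading of the commented tf/df bookkeeping, not code from the commit), the two maps would combine into a classic tf-idf ranking:

<?php
// sketch: rank docs by summed tf-idf over the query tokens, assuming
// $base, $qtokens, $tfs_per_tok_and_doc and $dfs_per_tok were filled
// as in the csv branch above
function score_docs($base, $qtokens, $tfs_per_tok_and_doc, $dfs_per_tok) {
    $nbdoc = count($base);
    $scores = array();
    foreach ($qtokens as $tok) {
        if (! array_key_exists($tok, $dfs_per_tok)) {
            continue;   // token absent from the index
        }
        $idf = log($nbdoc / $dfs_per_tok[$tok]);   // rarer tokens weigh more
        foreach ($tfs_per_tok_and_doc[$tok] as $doc_id => $tf) {
            if (! array_key_exists($doc_id, $scores)) {
                $scores[$doc_id] = 0;
            }
            $scores[$doc_id] += $tf * $idf;
        }
    }
    arsort($scores);   // highest score first; keys are rowids into $base
    return $scores;    // e.g. feed the top n to displayDoc()
}
?>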
--- a/LOCALDB/parameters_details.php
+++ b/LOCALDB/parameters_details.php
@@ -4,16 +4,20 @@
 # loading an associated db for a given gexf as relatedDocs php API
 $gexf_db = array();
-# $gexf_db["data/terrorism/terrorism_bi.gexf"] = "data/terrorism/data.db";
-# $gexf_db["data/ClimateChange/ClimateChangeV1.gexf"] = "data/ClimateChange/wosclimatechange-61715-1-wosclimatechange-db(2).db";
+# £££TODO should be passed by param
 $gexf_db["data/ClimateChange/Maps_S_800.gexf"] = "data/ClimateChange/wos_climate-change_title_2014-2015.db";
 $gexf_db["data/AXA/RiskV2PageRank1000.gexf"] = "data/AXA/data.db";
 $gexf_db["data/AXA/RiskV2PageRank2500.gexf"] = "data/AXA/data.db";
 $gexf_db["data/AXA/RiskV2PageRank5000.gexf"] = "data/AXA/data.db";
+$gexf_db["data/test/mini_for_csv.gexf"] = "data/test/mini_for_csv.tsv";
 
 // TESTS
+// for debug
+echo "<br>";
+var_dump($_GET);
+echo "<br>---<br>";
 // $gexf_db["data/ProgrammeDesCandidats.gexf"] = "foobar";
 $gexf= str_replace('"','',$_GET["gexf"]);
@@ -21,5 +25,12 @@ $gexf= str_replace('"','',$_GET["gexf"]);
 $mainpath=dirname(getcwd())."/";
 $graphdb = $gexf_db[$gexf];
+echodump("graphdb", $graphdb);
+
+function echodump($title, $anyObj) {
+    echo "<br>".$title.": ";
+    echo (json_encode($anyObj, JSON_PRETTY_PRINT));
+    echo "<br>";
+}
 ?>
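To make the lookup above concrete, here is what the debug helper prints for the test entry this commit adds (output shape only, as produced by echodump's json_encode):

<?php
// e.g. $_GET["gexf"] arrives as '"data/test/mini_for_csv.gexf"' and the
// surrounding quotes are stripped by the str_replace above
$graphdb = $gexf_db["data/test/mini_for_csv.gexf"];   // -> "data/test/mini_for_csv.tsv"
echodump("graphdb", $graphdb);
// browser output: <br>graphdb: "data/test/mini_for_csv.tsv"<br>
?>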
<gexf xmlns="http://www.gexf.net/1.1draft" xmlns:viz="http://www.gephi.org/gexf/viz" version="1.1">
  <graph defaultedgetype="undirected" type="static">
    <attributes class="node" type="static">
      <attribute id="0" title="category" type="string"> </attribute>
    </attributes>
    <nodes>
      <node id="T::0" label="scalajs">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="term"/>
        </attvalues>
      </node>
      <node id="T::1" label="scala">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="term"/>
        </attvalues>
      </node>
      <node id="T::2" label="python">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="term"/>
        </attvalues>
      </node>
      <node id="T::3" label="javascript">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="term"/>
        </attvalues>
      </node>
      <node id="T::4" label="php">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="term"/>
        </attvalues>
      </node>
      <node id="P::0" label="-bb-">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="person"/>
        </attvalues>
      </node>
      <node id="P::1" label="-aa-">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="person"/>
        </attvalues>
      </node>
      <node id="P::2" label="-dd-">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="person"/>
        </attvalues>
      </node>
      <node id="P::3" label="-cc-">
        <viz:size value="1"/>
        <attvalues>
          <attvalue for="0" value="person"/>
        </attvalues>
      </node>
    </nodes>
    <edges>
      <!-- bipartite 1|1 -->
      <edge id="1" source="T::1" target="P::1"></edge>
      <edge id="2" source="T::1" target="P::2"></edge>
      <edge id="3" source="T::2" target="P::3"></edge>
      <edge id="4" source="T::3" target="P::2"></edge>
      <edge id="5" source="T::3" target="P::3"></edge>
      <edge id="6" source="T::4" target="P::0"></edge>
      <edge id="7" source="T::4" target="P::3"></edge>
      <!-- intra sem 1|0 -->
      <edge id="8" source="T::2" target="T::3"></edge>
      <edge id="9" source="T::3" target="T::4"></edge>
      <edge id="10" source="T::0" target="T::1"></edge>
      <edge id="11" source="T::0" target="T::3"></edge>
      <!-- intra soc 0|1 -->
      <edge id="12" source="P::0" target="P::1"></edge>
      <edge id="13" source="P::1" target="P::2"></edge>
      <edge id="14" source="P::2" target="P::3"></edge>
    </edges>
  </graph>
</gexf>
--- a/db.json
+++ b/db.json
 {
+    "data/test": {
+        "first" : "mini_for_csv.gexf",
+        "gexfs": {
+            "mini_for_csv.gexf": {
+                "_comment": "NB: underspecified for csv and for db.json !! so this is a prototype structure; POSS: weighted columns for matching importance",
+                "dbtype": "csv",
+                "semantic": ["title","keywords","text"],
+                "social": ["author"],
+                "dbfile": "mini_for_csv.csv"
+            },
+            "test_with_various_atts.gexf": {}
+        }
+    },
     "data/politoscope": {
         "dbname":null,
         "title":"Politoscope",
         "date":"2017",
@@ -33,11 +47,5 @@
         "gexfs": {
             "graph_example.json": {}
         }
-    },
-    "data/test": {
-        "first" : "test_with_various_atts.gexf",
-        "gexfs": {
-            "test_with_various_atts.gexf": {}
-        }
     }
 }
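The JS side currently ships these column lists back to PHP in the toindex url arg; the "POSS use a direct access from php to db.json" note in the fetcher suggests the server could read them itself. A hypothetical sketch of that direct access (the db.json location and the helper name are assumptions, not part of the commit):

<?php
// hypothetical helper: read the searchable-columns entry straight from db.json
function get_relspace_entry($dbjson_path, $gexf_path) {
    $all = json_decode(file_get_contents($dbjson_path), true);
    $dir  = dirname($gexf_path);    // e.g. "data/test"
    $file = basename($gexf_path);   // e.g. "mini_for_csv.gexf"
    if (isset($all[$dir]['gexfs'][$file])) {
        return $all[$dir]['gexfs'][$file];   // {dbtype, semantic, social, dbfile}
    }
    return null;
}

// $entry = get_relspace_entry("../db.json", "data/test/mini_for_csv.gexf");
// $entry['semantic'] => ["title", "keywords", "text"]
?>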
@@ -11,7 +11,6 @@ TW.gui.colorFuns = {
   'cluster': "clusterColoring"
 }
 
-// Execution: changeGraphAppearanceByFacets( true )
 // It reads scanned node-attributes and prepared legends in TW.Clusters
 // to add the button in the html with the sigmaUtils.gradientColoring(x) listener.
@@ -420,8 +419,12 @@ function topPapersFetcher(swType, qWords, priorHtml, cbNext){
   let stockErrMsg = `<p class="micromessage">
                       Your settings for relatedDocsType are set on ${TW.conf.relatedDocsType}
-                      API but it couldn't be connected to.<br >Check if it is running and
-                      accessible:<br><span class=code>${TW.conf.relatedDocsAPI}</span></p>`
+                      API but it couldn't be connected to.</p>`
+
+  if (TW.conf.relatedDocsType == "api") {
+    stockErrMsg += `<p class="micromessage">Check if it is running and
+                    accessible:<br><span class=code>${TW.conf.relatedDocsAPI}</span></p>`
+  }
 
   let resHTML = ''
@@ -453,7 +456,7 @@ function topPapersFetcher(swType, qWords, priorHtml, cbNext){
     });
   }
   else if (TW.conf.relatedDocsType == "wosLocalDB") {
-    let gexfinfos = TW.fields[TW.File]
+    let gexfinfos = TW.relDocsInfos[TW.File]
     if (!gexfinfos || !gexfinfos[swType]) {
       resHTML =
         `<p>Your settings for relatedDocsType are set on a local wos database,
@@ -464,15 +467,40 @@ function topPapersFetcher(swType, qWords, priorHtml, cbNext){
       return
    }
    else {
+      // /!\ documentation and specification needed for the php use cases /!\
       let joinedQ = JSON.stringify(qWords).split('&').join('__and__');
-      let querytable = gexfinfos[swType]
-      let urlParams = "type="+swType+"&query="+joinedQ+"&gexf="+TW.File+"&index="+querytable+"&n="+TW.conf.relatedDocsMax
+
+      // cf. the php code for these url args:
+      //   - type: the node type (social/semantic)
+      //   - dbtype: 'sql' (classic sqlite like wos)
+      //             or 'csv' (like gargantext exports)
+      // POSS object + join.map(join)
+      let urlParams = "type="+swType+"&query="+joinedQ+"&gexf="+TW.File+"&n="+TW.conf.relatedDocsMax+"&dbtype="+gexfinfos.dbtype
+
+      if (gexfinfos.dbtype == "sql") {
+        var qIndex = gexfinfos[swType]      // a table
+        urlParams += `&index=${qIndex}`
+      }
+      else {
+        // a list of csv columns to search in
+        // ex: for semantic nodes matching we look in 'title', 'keywords' cols
+        //     for social nodes matching we look in 'authors' col... etc.
+        let joinedSearchCols = JSON.stringify(gexfinfos[swType])
+        urlParams += `&searchin=${joinedSearchCols}`
+
+        let joinedAllCols = JSON.stringify(gexfinfos)
+        urlParams += `&toindex=${joinedAllCols}`
+
+        // POSS use a direct access from php to db.json to avoid toindex
+        // POSS make it a REST array like: index[]=title&index[]=keywords
+      }
+
       $.ajax({
         type: 'GET',
-        url: 'LOCALDB/info_div.php',
+        url: TW.conf.relatedDocsAPI + '/info_div.php',
         data: urlParams,
         success : function(data){
-          // console.log('relatedDocs: LOCALDB/info_div.php?'+ urlParams);
+          console.log(`relatedDocs: ${TW.conf.relatedDocsAPI}/info_div.php?${urlParams}`);
          resHTML = data
          cbNext(priorHtml + resHTML)
        },
@@ -481,6 +509,12 @@ function topPapersFetcher(swType, qWords, priorHtml, cbNext){
          cbNext(priorHtml + stockErrMsg)
        }
      });
    }
  }
 }
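Putting the two sides together: for a csv source, the urlParams built above reach LOCALDB/info_div.php roughly as the following $_GET (a sketch based on the db.json test entry; actual values depend on the clicked nodes):

<?php
// sketch of the decoded request as seen by info_div.php in csv mode
$_GET = array(
    'type'     => 'semantic',                    // node type (social/semantic)
    'query'    => '["scala"]',                   // JSON list of selected labels
    'gexf'     => 'data/test/mini_for_csv.gexf',
    'n'        => '10',                          // TW.conf.relatedDocsMax
    'dbtype'   => 'csv',
    'searchin' => '["title","keywords","text"]', // cols to search for this type
    'toindex'  => '{"semantic":["title","keywords","text"],"social":["author"],"dbtype":"csv"}'
);
?>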
@@ -36,11 +36,12 @@ TW.conf = (function(TW){
   TWConf.getRelatedDocs = true
   TWConf.relatedDocsMax = 10
-  TWConf.relatedDocsAPI = "http://127.0.0.1:5000/twitter_search"
   TWConf.relatedDocsType = "wosLocalDB"   // accepted: "twitter" | "wosLocalDB"
                                           // POSSible: "elastic"
+  // TWConf.relatedDocsAPI = "http://127.0.0.1:5000/twitter_search"
+  TWConf.relatedDocsAPI = "LOCALDB"
 
   // =======================
   // DATA FACETS AND LEGENDS
   // =======================
@@ -147,8 +148,8 @@ TW.conf = (function(TW){
   // =============
   // Node typology: categories (resp. 0 and 1) will get these default labels
-  TWConf.catSem = "Terms";
-  TWConf.catSoc = "Document";
+  TWConf.catSem = "term";
+  TWConf.catSoc = "person";
   // NB: these labels may be superseded by the input data's node types values
   //     cf. sortNodeTypes()
@@ -290,7 +291,7 @@ TW.conf = (function(TW){
   // relative sizes (iff ChangeType == both nodetypes)
   TWConf.sizeMult = [];
   TWConf.sizeMult[0] = 1.0;   // ie for node type 0 (<=> sem)
-  TWConf.sizeMult[1] = 3.0;   // ie for node type 1 (<=> soc)
+  TWConf.sizeMult[1] = 2.0;   // ie for node type 1 (<=> soc)
 
   // ===========
@@ -13,7 +13,9 @@ TW.partialGraph = null // will contain the sigma visible graph instance
 TW.labels=[];      // fulltext search list
 TW.gexfPaths={};   // for file selectors iff servermenu
-TW.fields={};      // for related db tablenames
+TW.relDocsInfos={};   // map [graphsource => relatedDocs db fields or tables names]
+                      // TODO requires specifications !!
+
                       // (iff servermenu && relatedDocsType == 'wosLocalDB')
 TW.categories = [];   // possible node types and their inverted map
@@ -258,6 +260,7 @@ function syncRemoteGraphData () {
   for( var path in preRES.data ) {
     var theGexfs = preRES.data[path]["gexfs"]
     for(var aGexf in theGexfs) {
+
       var gexfBasename = aGexf.replace(/\.gexf$/, "")   // more human-readable in the menu
       TW.gexfPaths[gexfBasename] = path+"/"+aGexf
@@ -271,16 +274,41 @@ function syncRemoteGraphData () {
       // for associated wosLocalDBs sql queries
       if (theGexfs[aGexf]) {
-        TW.fields[path+"/"+aGexf] = {"semantic":null, "social":null}
-        if (theGexfs[aGexf]["semantic"] && theGexfs[aGexf]["semantic"]["table"]) {
-          TW.fields[path+"/"+aGexf]['semantic'] = theGexfs[aGexf]["semantic"]["table"]
-        }
-        if (theGexfs[aGexf]["social"] && theGexfs[aGexf]["social"]["table"]) {
-          TW.fields[path+"/"+aGexf]['social'] = theGexfs[aGexf]["social"]["table"]
-        }
+        let gSrcEntry = theGexfs[aGexf]
+
+        TW.relDocsInfos[path+"/"+aGexf] = {"semantic":null, "social":null, "dbtype": null}
+
+        // POSS have this type attribute in db.json *for all the entries*
+        // ----------------------------------------------------------------------------------
+        // choice: we'll keep a flat structure by source unless some use cases need otherwise
+        // ----------------------------------------------------------------------------------
+
+        // csv LocalDB ~ gargantext
+        if(gSrcEntry["dbtype"] && gSrcEntry["dbtype"] == "csv") {
+          TW.relDocsInfos[path+"/"+aGexf]['dbtype'] = "csv"
+          // it's CSV columns here
+          TW.relDocsInfos[path+"/"+aGexf]['semantic'] = gSrcEntry["semantic"]
+          TW.relDocsInfos[path+"/"+aGexf]['social'] = gSrcEntry["social"]
+        }
+        // sqlite LocalDB ~ wos
+        else {
+          TW.relDocsInfos[path+"/"+aGexf]['dbtype'] = "sql"
+          if (theGexfs[aGexf]["semantic"] && theGexfs[aGexf]["semantic"]["table"]) {
+            TW.relDocsInfos[path+"/"+aGexf]['semantic'] = theGexfs[aGexf]["semantic"]["table"]
+          }
+          if (theGexfs[aGexf]["social"] && theGexfs[aGexf]["social"]["table"]) {
+            TW.relDocsInfos[path+"/"+aGexf]['social'] = theGexfs[aGexf]["social"]["table"]
+          }
+        }
+        console.log("TW.relDocsInfos", TW.relDocsInfos)
       }
       else {
-        TW.fields[path+"/"+aGexf] = null
+        TW.relDocsInfos[path+"/"+aGexf] = null
       }
       // ^^^^^^ FIXME see if we got the expected behavior right
       //        (? specifications ?)