$cols) {
$postings[$nodetype] = array();
// echodump("parse_and_index_csv: typed cols", $cols);
for ($i = 0; $i < count($cols); $i++) {
$colname = $cols[$i];
$postings[$nodetype][$colname] = array();
}
}
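// NB: once filled by the indexing loop below, $postings has the nested shape
// $postings[$nodetype][$colname][$token][$docid] = term frequency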
// the colnum => colname map will be initialized from the first row
$colnames = array();
$rowid = 0;
if (($fh = fopen($filename, "r")) !== FALSE) {
// we assume the first line holds the column titles
$colnames = fgetcsv($fh, 20000, $separator, $quotechar);
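// e.g. a header line "id,title,text" yields $colnames = array('id', 'title', 'text')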
// we slurp and index the entire CSV
while (($line_fields = fgetcsv($fh, 20000, $separator, $quotechar)) !== FALSE) {
// NB: the 2nd arg is the maximum line length;
// we used 2x the longest line we saw in our examples
// (adjust it to your own use case)
$num = count($line_fields);
// echo "
$num fields in line $rowid:
\n";
$docid = 'd'.$rowid;
// keep the row in "database"
$base[$rowid] = array();
for ($c=0; $c < $num; $c++) {
$colname = $colnames[$c];
// debug
// echo "==>/".$colname."/:" . $line_fields[$c] . "
\n";
// store row -> fields -> value
$base[$rowid][$colname] = $line_fields[$c];
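// $base thus maps rowid => array(colname => value)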
// fill the search index if this type+column pair was requested in $postings
foreach (['semantic', 'social'] as $swtype) {
if (array_key_exists($swtype, $postings)) {
if (array_key_exists($colname, $postings[$swtype])) {
// basic tokenisation (TODO: make the delimiters etc. configurable)
$tokens = preg_split("/\W/", $line_fields[$c]);
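// e.g. preg_split("/\W/", "Hello, world!") yields array("Hello", "", "world", "")
// (the empty strings are filtered out by the strlen() test below)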
// for debug
// echo("indexing column:".$colname." under type:".$swtype.'
');
// var_dump($tokens);
foreach($tokens as $tok) {
if (strlen($tok)) {
// possible improvement: filter out stopwords here
if (! array_key_exists($tok, $postings[$swtype][$colname])) {
$postings[$swtype][$colname][$tok] = array();
}
// in a CSV, the rowid acts as the pointer to the document
if (array_key_exists($docid, $postings[$swtype][$colname][$tok])) {
// we keep the frequencies
$postings[$swtype][$colname][$tok][$docid]++;
}
else {
$postings[$swtype][$colname][$tok][$docid] = 1;
}
}
}
}
}
}
}
$rowid++;
}
fclose($fh);
}
return array($base, $postings);
}
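// Usage sketch, commented out so that including this file has no side effects.
// This excerpt starts mid-signature, so the exact parameter order of
// parse_and_index_csv below is an assumption, as are the file and column names:
// list($base, $postings) = parse_and_index_csv("docs.csv", ",", '"',
//     array('semantic' => array('title', 'text')));
// // all documents whose "text" column contains the token "graph"
// $hits = isset($postings['semantic']['text']['graph'])
//     ? $postings['semantic']['text']['graph'] : array();
// foreach ($hits as $docid => $freq) {
//     $rowid = (int) substr($docid, 1); // docids are 'd'.$rowid
//     echo $base[$rowid]['title']." (tf=".$freq.")\n";
// }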
?>