$col => $tok => $docid => $occs // // // function parse_and_index_csv($filename, $typed_cols_to_index, $separator, $quotechar) { // list of csv rows $base = array(); // initialize our inverted index by values $postings = array() ; foreach($typed_cols_to_index as $nodetype => $cols) { $postings[$nodetype] = array() ; // echodump("parse_and_index_csv: typed cols", $cols); for($i = 0; $i < count($cols) ; $i++) { $colname = $cols[$i.""]; $postings[$nodetype][$colname] = array(); } } // we'll initialize colnum => colname map from first row $colnames = array() ; $rowid = 0; if (($fh = fopen($filename, "r")) !== FALSE) { // we assume first line is titles $colnames = fgetcsv($fh, 20000, $separator, $quotechar); // we slurp and index the entire CSV while (($line_fields = fgetcsv($fh, 20000, $separator, $quotechar)) !== FALSE) { // NB 2nd arg is max length of line // we used here 2 * the longest we saw in the exemples // (change accordingly to your use cases) $num = count($line_fields); // echo "

$num fields in line $rowid:

\n"; $docid = 'd'.$rowid; // keep the row in "database" $base[$rowid] = array(); for ($c=0; $c < $num; $c++) { $colname = $colnames[$c]; // debug // echo "==>/".$colname."/:" . $line_fields[$c] . "
\n"; // store row -> fields -> value $base[$rowid][$colname] = $line_fields[$c]; // fill our search index if the type+col was asked in postings for ($ndtypeid = 0 ; $ndtypeid < $GLOBALS["ntypes"] ; $ndtypeid++) { if (array_key_exists($ndtypeid, $postings)) { if (array_key_exists($colname, $postings[$ndtypeid])) { // basic tokenisation on unicode punctuation and separators // cf http://unicode.org/reports/tr18/#General_Category_Property $tokens = preg_split("/[\p{Z}\p{P}\p{C}]+/u", $line_fields[$c]); // for debug // echo("indexing column:".$colname." under type:".$ndtypeid.'
'); // var_dump($tokens); foreach($tokens as $tok) { $tok = strtolower($tok); if (strlen($tok)) { // POSS : stopwords could be used here if (! array_key_exists($tok, $postings[$ndtypeid][$colname])) { $postings[$ndtypeid][$colname][$tok] = array(); } // in a csv, rowid is a pointer to the document if (array_key_exists($docid, $postings[$ndtypeid][$colname][$tok])) { // we keep the frequencies $postings[$ndtypeid][$colname][$tok][$docid]++ ; } else { $postings[$ndtypeid][$colname][$tok][$docid] = 1; } } } } } } } $rowid++; } fclose($fh); } // post-treatment: cumulative number of docs by token $df = array() ; for ($ndtypeid = 0 ; $ndtypeid < $GLOBALS["ntypes"] ; $ndtypeid++) { if (array_key_exists($ndtypeid, $postings)) { foreach ($postings[$ndtypeid] as $col => $occs_matrix) { foreach ($occs_matrix as $tok => $doc_occs) { if (array_key_exists($tok, $df)) { $df[$tok] += count($doc_occs); } else { $df[$tok] = count($doc_occs); } } } } } $logtotaldocs = log($rowid + 1); $idfvals = array(); foreach ($df as $tok => $df_tok) { $idfvals[$tok] = $logtotaldocs - log($df_tok); } return array($base, $postings, $idfvals); } ?>