Commit c00c5309 authored by Romain Loth

phpAPI: (csv index) use tfidf as score instead of sum(freq)

parent e74b8d26
...@@ -48,7 +48,7 @@ function displayDoc($docId, $score, $base, $outmode) { ...@@ -48,7 +48,7 @@ function displayDoc($docId, $score, $base, $outmode) {
"kws" => $keywords, "kws" => $keywords,
"txt" => $doccontent, "txt" => $doccontent,
"date" => $date, "date" => $date,
"score" => $score "score" => round($score, 5)
); );
} }
return $output; return $output;
...@@ -72,14 +72,14 @@ function try_attrs_until_you_find($doc_obj, $attr_names_array) { ...@@ -72,14 +72,14 @@ function try_attrs_until_you_find($doc_obj, $attr_names_array) {
$htmlout = "<ul class=infoitems>\n"; $htmlout = "<ul class=infoitems>\n";
$jsonout = array(); $jsonout = array();
$nb_displayed = 0; $nb_displayed = 0;
foreach ($sims as $doc => $freq) { foreach ($sims as $doc => $score) {
// doc limit // doc limit
if ($nb_displayed > $max_item_displayed - 1) { if ($nb_displayed > $max_item_displayed - 1) {
break; break;
} }
$rowid = ltrim($doc, 'd'); $rowid = ltrim($doc, 'd');
$thisdoc = displayDoc($rowid, $freq, $base, $output_mode); $thisdoc = displayDoc($rowid, $score, $base, $output_mode);
// echodump("doc", $thisdoc); // echodump("doc", $thisdoc);
if ($output_mode == "html") { if ($output_mode == "html") {
......
...@@ -49,17 +49,7 @@ ...@@ -49,17 +49,7 @@
// }, // },
// //
// //
// The postings have the form: { // The postings have the form: $nodetype => $col => $tok => $docid => $occs
// col_i => {
// "tokenA" => {
// docid0: occs i.A.0,
// docid1: occs i.A.1,
// ...
// },
// ...
// },
// ...
// }
// //
// //
// //
...@@ -154,8 +144,30 @@ function parse_and_index_csv($filename, $typed_cols_to_index, $separator, $quote ...@@ -154,8 +144,30 @@ function parse_and_index_csv($filename, $typed_cols_to_index, $separator, $quote
} }
fclose($fh); fclose($fh);
} }
// post-treatment: cumulative number of docs by token
$df = array() ;
for ($ndtypeid = 0 ; $ndtypeid < $GLOBALS["ntypes"] ; $ndtypeid++) {
if (array_key_exists($ndtypeid, $postings)) {
foreach ($postings[$ndtypeid] as $col => $occs_matrix) {
foreach ($occs_matrix as $tok => $doc_occs) {
if (array_key_exists($tok, $df)) {
$df[$tok] += count($doc_occs);
}
else {
$df[$tok] = count($doc_occs);
}
}
}
}
}
$logtotaldocs = log($rowid + 1);
$idfvals = array();
foreach ($df as $tok => $df_tok) {
$idfvals[$tok] = $logtotaldocs - log($df_tok);
}
return array($base, $postings); return array($base, $postings, $idfvals);
} }
......
...@@ -90,8 +90,10 @@ else { ...@@ -90,8 +90,10 @@ else {
$base = $csv_search_base[0]; $base = $csv_search_base[0];
$postings = $csv_search_base[1]; $postings = $csv_search_base[1];
$idfvals = $csv_search_base[2];
// echodump("postings", $postings); // echodump("postings", $postings);
// echodump("base", $base); // echodump("base", $base);
// echodump("idfvals", $idfvals);
// DO THE SEARCH // DO THE SEARCH
...@@ -130,16 +132,16 @@ else { ...@@ -130,16 +132,16 @@ else {
// matches // matches
$matching_docs = $searchable[$tok]; $matching_docs = $searchable[$tok];
foreach ($matching_docs as $doc_id => $freq) { foreach ($matching_docs as $doc_id => $tf) {
// echodump("tok freq in this doc", $freq); // echodump("tok freq in this doc", $tf);
// cumulated freq of tokens per doc // cumulated freq of tokens per doc
if (array_key_exists($doc_id, $sims)) { if (array_key_exists($doc_id, $sims)) {
$sims[$doc_id]++; $sims[$doc_id] += $tf * $idfvals[$tok];
} }
else { else {
$sims[$doc_id] = 1; $sims[$doc_id] = $tf * $idfvals[$tok];
} }
} }
} }
......
...@@ -15,7 +15,6 @@ function echodump($title, $anyObj, $output_mode = "json") { ...@@ -15,7 +15,6 @@ function echodump($title, $anyObj, $output_mode = "json") {
} }
} }
function errmsg($message, $context, $more = "") { function errmsg($message, $context, $more = "") {
echo "<p class='micromessage'>The relatedDocs DB conf for $context $message echo "<p class='micromessage'>The relatedDocs DB conf for $context $message
(please read A-Introduction/project_config.md).<br>$more</p>"; (please read A-Introduction/project_config.md).<br>$more</p>";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment