Commit c00c5309 authored by Romain Loth

phpAPI: (csv index) use tfidf as score instead of sum(freq)

parent e74b8d26
......@@ -48,7 +48,7 @@ function displayDoc($docId, $score, $base, $outmode) {
"kws" => $keywords,
"txt" => $doccontent,
"date" => $date,
"score" => $score
"score" => round($score, 5)
);
}
return $output;
......@@ -72,14 +72,14 @@ function try_attrs_until_you_find($doc_obj, $attr_names_array) {
$htmlout = "<ul class=infoitems>\n";
$jsonout = array();
$nb_displayed = 0;
foreach ($sims as $doc => $freq) {
foreach ($sims as $doc => $score) {
// doc limit
if ($nb_displayed > $max_item_displayed - 1) {
break;
}
$rowid = ltrim($doc, 'd');
$thisdoc = displayDoc($rowid, $freq, $base, $output_mode);
$thisdoc = displayDoc($rowid, $score, $base, $output_mode);
// echodump("doc", $thisdoc);
if ($output_mode == "html") {
......
......@@ -49,17 +49,7 @@
// },
//
//
// The postings have the form: {
// col_i => {
// "tokenA" => {
// docid0: occs i.A.0,
// docid1: occs i.A.1,
// ...
// },
// ...
// },
// ...
// }
// The postings have the form: $nodetype => $col => $tok => $docid => $occs
//
//
//
......@@ -154,8 +144,30 @@ function parse_and_index_csv($filename, $typed_cols_to_index, $separator, $quote
}
fclose($fh);
}
// post-treatment: cumulative number of docs by token
$df = array() ;
for ($ndtypeid = 0 ; $ndtypeid < $GLOBALS["ntypes"] ; $ndtypeid++) {
if (array_key_exists($ndtypeid, $postings)) {
foreach ($postings[$ndtypeid] as $col => $occs_matrix) {
foreach ($occs_matrix as $tok => $doc_occs) {
if (array_key_exists($tok, $df)) {
$df[$tok] += count($doc_occs);
}
else {
$df[$tok] = count($doc_occs);
}
}
}
}
}
$logtotaldocs = log($rowid + 1);
$idfvals = array();
foreach ($df as $tok => $df_tok) {
$idfvals[$tok] = $logtotaldocs - log($df_tok);
}
return array($base, $postings);
return array($base, $postings, $idfvals);
}
......
......@@ -90,8 +90,10 @@ else {
$base = $csv_search_base[0];
$postings = $csv_search_base[1];
$idfvals = $csv_search_base[2];
// echodump("postings", $postings);
// echodump("base", $base);
// echodump("idfvals", $idfvals);
// DO THE SEARCH
......@@ -130,16 +132,16 @@ else {
// matches
$matching_docs = $searchable[$tok];
foreach ($matching_docs as $doc_id => $freq) {
foreach ($matching_docs as $doc_id => $tf) {
// echodump("tok freq in this doc", $freq);
// echodump("tok freq in this doc", $tf);
// cumulated tf-idf score per doc (sum of tf * idf over the matching tokens)
if (array_key_exists($doc_id, $sims)) {
$sims[$doc_id]++;
$sims[$doc_id] += $tf * $idfvals[$tok];
}
else {
$sims[$doc_id] = 1;
$sims[$doc_id] = $tf * $idfvals[$tok];
}
}
}
......
......@@ -15,7 +15,6 @@ function echodump($title, $anyObj, $output_mode = "json") {
}
}
function errmsg($message, $context, $more = "") {
echo "<p class='micromessage'>The relatedDocs DB conf for $context $message
(please read A-Introduction/project_config.md).<br>$more</p>";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment