Commit 0234ddbd authored by Romain Loth

ISTEX parsing reactivated + stop hyperdata indexation on text for the moment

parent 54976e29
@@ -93,19 +93,19 @@ INDEXED_HYPERDATA = {
         , 'convert_from_db': str
         },
-    'text':
-        { 'id' : 7
-        , 'type' : str
-        , 'convert_to_db' : str
-        , 'convert_from_db': str
-        },
-    'page':
-        { 'id' : 8
-        , 'type' : int
-        , 'convert_to_db' : int
-        , 'convert_from_db': int
-        },
+    # 'text':
+    #     { 'id' : 7
+    #     , 'type' : str
+    #     , 'convert_to_db' : str
+    #     , 'convert_from_db': str
+    #     },
+    #
+    # 'page':
+    #     { 'id' : 8
+    #     , 'type' : int
+    #     , 'convert_to_db' : int
+    #     , 'convert_from_db': int
+    #     },
     }
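For context, a hedged sketch (not code from this commit) of how the conversion callables in an INDEXED_HYPERDATA entry are typically applied when writing and reading an indexed value; the helper names below are hypothetical.

# Hypothetical helpers illustrating the role of 'convert_to_db' and
# 'convert_from_db'; the entry layout is taken from the dict above.
def to_db_value(indexed_hyperdata, key, value):
    """Convert a Python value to the form stored in the index."""
    entry = indexed_hyperdata[key]
    return entry['id'], entry['convert_to_db'](value)

def from_db_value(indexed_hyperdata, key, db_value):
    """Convert a stored value back to its declared Python type."""
    entry = indexed_hyperdata[key]
    return entry['convert_from_db'](db_value)

With the 'text' and 'page' entries commented out, values under those keys are simply no longer indexed this way, which matches the commit message.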
@@ -181,7 +181,7 @@ DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms in %
 DEFAULT_TFIDF_HARD_LIMIT = 750 # MAINLIST maximum terms abs
                                # (makes COOCS larger ~ O(N²) /!\)
-DEFAULT_COOC_THRESHOLD = 3 # inclusive minimum for COOCS coefs
+DEFAULT_COOC_THRESHOLD = 2 # inclusive minimum for COOCS coefs
                            # (makes COOCS more sparse)
 DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms
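A small illustration (invented numbers, not from the repo) of what lowering the inclusive threshold from 3 to 2 changes: pairs that cooccur only twice are now kept, so the resulting cooccurrence matrix is denser.

# Toy cooccurrence counts; only pairs with coef >= threshold are kept.
coocs = {('graph', 'node'): 5, ('node', 'edge'): 2, ('rare', 'pair'): 1}

threshold = 2   # was 3 before this commit
kept = {pair: coef for pair, coef in coocs.items() if coef >= threshold}
# kept == {('graph', 'node'): 5, ('node', 'edge'): 2}
# with threshold = 3, ('node', 'edge') would have been dropped as well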
@@ -18,11 +18,11 @@ class CSVParser(Parser):
         return Freqs
-    def parse(self, filename):
+    def parse(self, filebuf):
         print("CSV: parsing (assuming UTF-8 and LF line endings)")
-        contents = filename.read().decode("UTF-8").split("\n")
+        contents = filebuf.read().decode("UTF-8").split("\n")
         sample_size = 10
         sample_contents = contents[0:sample_size]
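Since parse() now takes an open buffer rather than a path, a caller would look roughly like the sketch below. This assumes CSVParser can be instantiated directly and that parse() returns the extracted documents; neither is shown in this diff.

from io import BytesIO

# CSVParser's import path is project-specific and not visible in this diff.
parser = CSVParser()

# From a file on disk: open in binary mode so .read().decode("UTF-8") works.
with open("corpus.csv", "rb") as filebuf:
    docs = parser.parse(filebuf)

# From an in-memory upload: any object whose .read() returns bytes is enough.
docs = parser.parse(BytesIO(b"title\tabstract\nA title\tSome text\n"))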
@@ -5,10 +5,10 @@ import json
 class ISTexParser(Parser):
-    def parse(self, thefile):
-        json_data=open(thefile,"r")
-        data = json.load(json_data)
-        json_data.close()
+    def parse(self, filebuf):
+        contents = filebuf.read().decode("UTF-8")
+        data = json.loads(contents)
+        filebuf.close()
         json_docs = data["hits"]
         hyperdata_list = []
         hyperdata_path = {
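With the parser reactivated and switched to the buffer-based signature, calling it would look roughly like this. The sample payload is invented (only the top-level "hits" key is visible in the diff; real ISTEX exports carry more fields per hit), and the assumption that parse() returns the hyperdata list it builds is not confirmed here.

import json
from io import BytesIO

# Minimal fake ISTEX response: a JSON object with a "hits" list, UTF-8 encoded.
sample = json.dumps({"hits": [{"title": "Some ISTEX record"}]}).encode("UTF-8")

parser = ISTexParser()   # import path depends on the project layout
hyperdata_list = parser.parse(BytesIO(sample))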