Commit 0234ddbd authored by Romain Loth

ISTEX parsing reactivated + stop hyperdata indexation on text for the moment

parent 54976e29
...@@ -93,19 +93,19 @@ INDEXED_HYPERDATA = { ...@@ -93,19 +93,19 @@ INDEXED_HYPERDATA = {
, 'convert_from_db': str , 'convert_from_db': str
}, },
'text': # 'text':
{ 'id' : 7 # { 'id' : 7
, 'type' : str # , 'type' : str
, 'convert_to_db' : str # , 'convert_to_db' : str
, 'convert_from_db': str # , 'convert_from_db': str
}, # },
#
'page': # 'page':
{ 'id' : 8 # { 'id' : 8
, 'type' : int # , 'type' : int
, 'convert_to_db' : int # , 'convert_to_db' : int
, 'convert_from_db': int # , 'convert_from_db': int
}, # },
} }
...@@ -181,7 +181,7 @@ DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms in % ...@@ -181,7 +181,7 @@ DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms in %
DEFAULT_TFIDF_HARD_LIMIT = 750 # MAINLIST maximum terms abs DEFAULT_TFIDF_HARD_LIMIT = 750 # MAINLIST maximum terms abs
# (makes COOCS larger ~ O(N²) /!\) # (makes COOCS larger ~ O(N²) /!\)
DEFAULT_COOC_THRESHOLD = 3 # inclusive minimum for COOCS coefs DEFAULT_COOC_THRESHOLD = 2 # inclusive minimum for COOCS coefs
# (makes COOCS more sparse) # (makes COOCS more sparse)
DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms
......
...@@ -18,11 +18,11 @@ class CSVParser(Parser): ...@@ -18,11 +18,11 @@ class CSVParser(Parser):
return Freqs return Freqs
def parse(self, filename): def parse(self, filebuf):
print("CSV: parsing (assuming UTF-8 and LF line endings)") print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filename.read().decode("UTF-8").split("\n") contents = filebuf.read().decode("UTF-8").split("\n")
sample_size = 10 sample_size = 10
sample_contents = contents[0:sample_size] sample_contents = contents[0:sample_size]
......
...@@ -5,10 +5,10 @@ import json ...@@ -5,10 +5,10 @@ import json
class ISTexParser(Parser): class ISTexParser(Parser):
def parse(self, thefile): def parse(self, filebuf):
json_data=open(thefile,"r") contents = filebuf.read().decode("UTF-8")
data = json.load(json_data) data = json.loads(contents)
json_data.close() filebuf.close()
json_docs = data["hits"] json_docs = data["hits"]
hyperdata_list = [] hyperdata_list = []
hyperdata_path = { hyperdata_path = {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment