Commit 0234ddbd authored by Romain Loth

ISTEX parsing reactivated + stop hyperdata indexation on text for the moment

parent 54976e29
@@ -93,19 +93,19 @@ INDEXED_HYPERDATA = {
         , 'convert_from_db': str
         },
-    'text':
-        { 'id' : 7
-        , 'type' : str
-        , 'convert_to_db' : str
-        , 'convert_from_db': str
-        },
-    'page':
-        { 'id' : 8
-        , 'type' : int
-        , 'convert_to_db' : int
-        , 'convert_from_db': int
-        },
+    # 'text':
+    #     { 'id' : 7
+    #     , 'type' : str
+    #     , 'convert_to_db' : str
+    #     , 'convert_from_db': str
+    #     },
+    #
+    # 'page':
+    #     { 'id' : 8
+    #     , 'type' : int
+    #     , 'convert_to_db' : int
+    #     , 'convert_from_db': int
+    #     },
     }
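For context, a hedged sketch (not code from this commit) of how the conversion callables in an INDEXED_HYPERDATA entry are typically applied when writing and reading an indexed value; the helper names below are hypothetical.

# Hypothetical helpers illustrating the role of 'convert_to_db' and
# 'convert_from_db'; the entry layout is taken from the dict above.
def to_db_value(indexed_hyperdata, key, value):
    """Convert a Python value to the form stored in the index."""
    entry = indexed_hyperdata[key]
    return entry['id'], entry['convert_to_db'](value)

def from_db_value(indexed_hyperdata, key, db_value):
    """Convert a stored value back to its declared Python type."""
    entry = indexed_hyperdata[key]
    return entry['convert_from_db'](db_value)

With the 'text' and 'page' entries commented out, values under those keys are simply no longer indexed this way, which matches the commit message.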
@@ -181,7 +181,7 @@ DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms in %
 DEFAULT_TFIDF_HARD_LIMIT = 750 # MAINLIST maximum terms abs
                                # (makes COOCS larger ~ O(N²) /!\)
-DEFAULT_COOC_THRESHOLD = 3 # inclusive minimum for COOCS coefs
+DEFAULT_COOC_THRESHOLD = 2 # inclusive minimum for COOCS coefs
                            # (makes COOCS more sparse)
 DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms
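A small illustration (invented numbers, not from the repo) of what lowering the inclusive threshold from 3 to 2 changes: pairs that cooccur only twice are now kept, so the resulting cooccurrence matrix is denser.

# Toy cooccurrence counts; only pairs with coef >= threshold are kept.
coocs = {('graph', 'node'): 5, ('node', 'edge'): 2, ('rare', 'pair'): 1}

threshold = 2   # was 3 before this commit
kept = {pair: coef for pair, coef in coocs.items() if coef >= threshold}
# kept == {('graph', 'node'): 5, ('node', 'edge'): 2}
# with threshold = 3, ('node', 'edge') would have been dropped as well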
@@ -18,11 +18,11 @@ class CSVParser(Parser):
         return Freqs
-    def parse(self, filename):
+    def parse(self, filebuf):
         print("CSV: parsing (assuming UTF-8 and LF line endings)")
-        contents = filename.read().decode("UTF-8").split("\n")
+        contents = filebuf.read().decode("UTF-8").split("\n")
         sample_size = 10
         sample_contents = contents[0:sample_size]
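Since parse() now takes an open buffer rather than a path, a caller would look roughly like the sketch below. This assumes CSVParser can be instantiated directly and that parse() returns the extracted documents; neither is shown in this diff.

from io import BytesIO

# CSVParser's import path is project-specific and not visible in this diff.
parser = CSVParser()

# From a file on disk: open in binary mode so .read().decode("UTF-8") works.
with open("corpus.csv", "rb") as filebuf:
    docs = parser.parse(filebuf)

# From an in-memory upload: any object whose .read() returns bytes is enough.
docs = parser.parse(BytesIO(b"title\tabstract\nA title\tSome text\n"))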
@@ -5,10 +5,10 @@ import json
 class ISTexParser(Parser):
-    def parse(self, thefile):
-        json_data=open(thefile,"r")
-        data = json.load(json_data)
-        json_data.close()
+    def parse(self, filebuf):
+        contents = filebuf.read().decode("UTF-8")
+        data = json.loads(contents)
+        filebuf.close()
         json_docs = data["hits"]
         hyperdata_list = []
         hyperdata_path = {
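With the parser reactivated and switched to the buffer-based signature, calling it would look roughly like this. The sample payload is invented (only the top-level "hits" key is visible in the diff; real ISTEX exports carry more fields per hit), and the assumption that parse() returns the hyperdata list it builds is not confirmed here.

import json
from io import BytesIO

# Minimal fake ISTEX response: a JSON object with a "hits" list, UTF-8 encoded.
sample = json.dumps({"hits": [{"title": "Some ISTEX record"}]}).encode("UTF-8")

parser = ISTexParser()   # import path depends on the project layout
hyperdata_list = parser.parse(BytesIO(sample))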