Merge remote-tracking branch 'origin/simon-dev' into dev

2bf705d9 · Alexandre Delanoë · b25deb7b · fbfb5d60 · 2bf705d9 · 2bf705d9
Commit 2bf705d9 authored Jan 18, 2018 by Alexandre Delanoë
Showing with 41 additions and 3 deletions

env.py alembic/env.py +2 -1

db.py gargantext/util/db.py +1 -1

CSV.py gargantext/util/parsers/CSV.py +2 -1

ISI.py gargantext/util/parsers/ISI.py +33 -0

_Parser.py gargantext/util/parsers/_Parser.py +3 -0

No files found.
--- a/alembic/env.py
+++ b/alembic/env.py
@@ -18,7 +18,8 @@ from gargantext import settings, models
 # this is the Alembic Config object, which provides
 # access to the values within the .ini file in use.
 config = context.config
-config.set_main_option("sqlalchemy.url", settings.DATABASES['default']['URL'])
+config.set_main_option("sqlalchemy.url",
+    settings.DATABASES['default']['SECRET_URL'])
 # Interpret the config file for Python logging.
 # This line sets up loggers basically.

--- a/gargantext/util/db.py
+++ b/gargantext/util/db.py
@@ -10,7 +10,7 @@ from sqlalchemy import delete
 def get_engine():
    from sqlalchemy import create_engine
-    return create_engine( settings.DATABASES['default']['URL']
+    return create_engine( settings.DATABASES['default']['SECRET_URL']
                        , use_native_hstore = True
                        , json_serializer = json_dumps
                        , pool_size=20, max_overflow=0

--- a/gargantext/util/parsers/CSV.py
+++ b/gargantext/util/parsers/CSV.py
@@ -16,7 +16,8 @@ class CSVParser(Parser):
    def parse(self, fp=None):
        fp = fp or self._file
-        df = pandas.read_csv(fp, dtype=object, skip_blank_lines=True, sep=None,
+        df = pandas.read_csv(fp, dtype=object, engine='python',
+                                 skip_blank_lines=True, sep=None,
                                 na_values=[], keep_default_na=False)
        # Return a generator of dictionaries with column labels as keys,

--- a/gargantext/util/parsers/ISI.py
+++ b/gargantext/util/parsers/ISI.py
+import re
 from .RIS import RISParser
@@ -17,3 +19,34 @@ class ISIParser(RISParser):
            "AB":  {"type": "hyperdata", "key": "abstract", "separator": " "},
            "WC":  {"type": "hyperdata", "key": "fields"},
        }
+        _year = re.compile(r'\b\d{4}\b')
+        _season = re.compile(r'\b(SPR|SUM|FAL|WIN)\b', re.I)
+        _month_interval = re.compile(r'\b([A-Z]{3})-([A-Z]{3})\b', re.I)
+        _day_interval = re.compile(r'\b(\d{1,2})-(\d{1,2})\b')
+        def _preprocess_PD(self, PD, PY):
+            # Normalize the publication-date string PD, using PY as the source
+            # of a year when PD has none; returns the rewritten PD string.
+            # The steps run in a fixed order: year, season, month interval,
+            # day interval.
+            # Add a year to date if applicable: PY is prepended only when PY
+            # contains a 4-digit number and PD does not
+            if PY and self._year.search(PY) and not self._year.search(PD):
+                PD = PY + " " + PD
+            # Drop season if any (SPR, SUM, FAL, WIN; case-insensitive), then
+            # trim the surrounding whitespace left behind
+            PD = self._season.sub('', PD).strip()
+            # If a month interval is present, keep only the first month
+            # (two 3-letter words joined by "-", e.g. "JAN-FEB" -> "JAN")
+            PD = self._month_interval.sub(r'\1', PD)
+            # If a day interval is present, keep only the first day
+            # (two 1-2 digit numbers joined by "-", e.g. "15-17" -> "15")
+            PD = self._day_interval.sub(r'\1', PD)
+            return PD
+        def parse(self, file):
+            PD = self._parameters["PD"]["key"]
+            PY = self._parameters["PY"]["key"]
+            for entry in super().parse(file):
+                if PD in entry:
+                    entry[PD] = self._preprocess_PD(entry[PD], entry[PY])
+                yield entry
--- a/gargantext/util/parsers/_Parser.py
+++ b/gargantext/util/parsers/_Parser.py
@@ -86,6 +86,9 @@ class Parser:
            print("WARNING: Date unknown at _Parser level, using now()")
            hyperdata['publication_date'] = datetime.now()
+        # XXX Handling prefixes is most likely useless: there seems to be only
+        #     one prefix, "publication" (as in "publication_date").
        # ...then parse all the "date" fields, to parse it into separate elements
        prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
        for prefix in prefixes: