Commit bd7294b2 authored by delanoe

Merge remote-tracking branch 'origin/romain-testing-taggers-parsers' into testing

parents b98b816d 08fed238
@@ -7,21 +7,36 @@ class RISParser(Parser):
_begin = 6
_parameters = {
"ER": {"type": "delimiter"}, # the record delimiter
"TI": {"type": "hyperdata", "key": "title", "separator": " "},
"T1": {"type": "hyperdata", "key": "title", "separator": " "},
# "T1": variant of TI (if together only last will be kept)
"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
"AU": {"type": "hyperdata", "key": "authors", "separator": "\n"},
"JO": {"type": "hyperdata", "key": "journal"},
"T2": {"type": "hyperdata", "key": "journal"},
# "T2": variant of JO (if together only last will be kept)
"UR": {"type": "hyperdata", "key": "doi"},
# RIS format specifications: PY is not only year but YYYY/MM/DD with MM and DD optional
# cf. https://en.wikipedia.org/wiki/RIS_(file_format)
"PY": {"type": "hyperdata", "key": "publication_year"},
"PD": {"type": "hyperdata", "key": "publication_month"},
"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
"N1": {"type": "hyperdata", "key": "references", "separator": ", "}, # more like notes in reality
"LA": {"type": "hyperdata", "key": "language_iso2"},
"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
"WC": {"type": "hyperdata", "key": "fields"},
# TODO other interesting fields
# "KW" (keywords)
# "A1", "A2"... (variants of AU)
# "N2" (variant of AB)
# previously mentioned here but in fact not in RIS specifications
# "PD": {"type": "hyperdata", "key": "publication_month"},
# "WC": {"type": "hyperdata", "key": "fields"},
}
def parse(self, file):
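
The PY handling above defers to PY_values_decompose_and_save(), which is defined elsewhere in the parser module and does not appear in this diff. A minimal sketch of such a helper, assuming it only has to split the "YYYY/MM/DD" value (MM and DD optional) into separate hyperdata keys:

    def PY_values_decompose_and_save(py_value, hyperdata):
        # PY may be "YYYY", "YYYY/MM" or "YYYY/MM/DD" (cf. the RIS spec comment above)
        parts = py_value.strip().split('/')
        for key, part in zip(['publication_year', 'publication_month', 'publication_day'], parts):
            if part:
                hyperdata[key] = part
        return hyperdata
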
@@ -33,49 +48,66 @@ class RISParser(Parser):
for line in file:
# bytes ~~> str
line = line.decode("UTF-8").rstrip('\r\n')
# print("RIS line:", line)
if len(line) >= 2 :
# extract the parameter key...
parameter_key = line[:2]
# ...and keep the rest for when we know what to do with it
current_value = line[self._begin:]
# it's a new key => therefore the previous key is finished
if parameter_key != last_key:
if last_key in self._parameters:
# translate key
parameter = self._parameters[last_key]
# 1 - we record the previous value array...
if parameter["type"] == "hyperdata":
separator = parameter["separator"] if "separator" in parameter else ""
final_value = separator.join(last_values)
if last_key != 'PY':
hyperdata[parameter["key"]] = final_value
else:
hyperdata = PY_values_decompose_and_save(final_value, hyperdata)
#... or even finish the record (rare here, most often after empty line)
elif parameter["type"] == "delimiter":
if 'language_fullname' not in hyperdata.keys():
if 'language_iso3' not in hyperdata.keys():
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
last_key = None
hyperdata = {}
# 2 - new key: also we start a new value array and move on to the next key
last_values = []
last_key = parameter_key
# 3 - new key or old: in any case we pass contents to
# print("(nonemptyline)")
# test if key line (otherwise: continuation line)
if match(r'[A-Z][A-Z0-9]\s', line):
parameter_key = line[:2]
# print("(matchparamline:"+parameter_key+")")
# we can now be sure that the value is rest of the line
# (keep it for when we know what to do with it)
current_value = line[self._begin:]
# it's a new key => therefore the previous key is finished
if parameter_key != last_key:
if last_key in self._parameters:
# translate key
parameter = self._parameters[last_key]
# 1 - we record the previous value array...
if parameter["type"] == "hyperdata":
separator = parameter["separator"] if "separator" in parameter else ""
final_value = separator.join(last_values)
if last_key != 'PY':
hyperdata[parameter["key"]] = final_value
else:
hyperdata = PY_values_decompose_and_save(final_value, hyperdata)
# print("{saved previous"+last_key+"}")
#... or even finish the record (rare here, most often after empty line)
elif parameter["type"] == "delimiter":
if 'language_fullname' not in hyperdata.keys():
if 'language_iso3' not in hyperdata.keys():
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
# print("{saved previous record}")
last_key = None
hyperdata = {}
# 2 - new key: also we start a new value array and move on to the next key
last_values = []
last_key = parameter_key
# continuation line: values start from position 0
else:
current_value = line
# print("(continuationline)")
# 3 - new key or old or no key: in any case we pass contents to
# the value array buffer (=> for the next loop only)
last_values.append(current_value)
current_value = None
# empty line => we need to check if PREVIOUS LINE was record delimiter
else:
# print("(emptyline)")
if last_key in self._parameters:
if parameter["type"] == "delimiter":
if 'language_fullname' not in hyperdata.keys():
@@ -83,6 +115,7 @@ class RISParser(Parser):
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
# print("{saved previous record}")
last_key = None
hyperdata = {}
# [end of loop per lines]
@@ -97,6 +130,7 @@ class RISParser(Parser):
hyperdata[parameter["key"]] = final_value
else:
hyperdata = PY_values_decompose_and_save(final_value, hyperdata)
# print("{saved previous"+last_key+"}")
# if a hyperdata object is left in memory, yield it as well
if hyperdata:
@@ -105,8 +139,7 @@ class RISParser(Parser):
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
# print("{saved previous record}")
@@ -14,28 +14,9 @@ from gargantext.constants import NODETYPES
from gargantext.util.db import session
class RoutesChecker(TestCase):
@classmethod
def setUpClass(cls):
"""
Will be run *once* for all tests here
NEEDS TO HAVE TestCase.setUpClass()
"""
TestCase.setUpClass()
new_project = Node(
typename = 'PROJECT',
name = "hello i'm a project",
user_id = 1 # todo make sure it's the same user as login
)
session.add(new_project)
session.commit()
cls.a_node_id = new_project.id
print("created a project with id: %i" % new_project.id)
def setUp(self):
"""
Will be run before *each* test here
Will be run before *each* test
"""
self.client = Client()
@@ -46,6 +27,16 @@ class RoutesChecker(TestCase):
)
# print(response.status_code) # expected: 302 FOUND
new_project = Node(
typename = 'PROJECT',
name = "hello i'm a project",
user_id = 1 # todo make sure it's the same user as login
)
session.add(new_project)
session.commit()
self.a_node_id = new_project.id
print("created a project with id: %i" % new_project.id)
def test_071a_get_front_page(self):
''' get the front page / '''
front_response = self.client.get('/')
@@ -78,7 +69,7 @@ class RoutesChecker(TestCase):
def test_073_get_api_one_node(self):
''' get "api/nodes/<node_id>" '''
one_node_route = '/api/nodes/%i' % RoutesChecker.a_node_id
one_node_route = '/api/nodes/%i' % self.a_node_id
# print("\ntesting node route: %s" % one_node_route)
api_response = self.client.get(one_node_route)
self.assertTrue(api_response.has_header('Content-Type'))
@@ -11,19 +11,21 @@ from gargantext.util.db import session
from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource
from gargantext.util.toolchain.main import *
DATA_SAMPLE_DIR = "/srv/gargantext_lib/test_samples/"
DATA_SAMPLE_DIR = "/srv/gargantext/unittests/mini_test_samples/"
# todo make it read NDOCS from a json overview to add in DATA_SAMPLE_DIR
DATA_SAMPLE_NDOCS = [
None, # RESOURCETYPES
[50,4,50], # 1-europresse
[7], # 1-europresse
[], # 2-jstor
[81,81], # 3-pubmed
[-1], # 4-scopus
[-1], # 5-web_of_science
[-1], # 6-zotero
[837,1000], #  7-csv
[-1], #  8-istex
[3,10], # 9-scoap
[-1], # 10-repec
[10], # 3-pubmed
[], # 4-scopus
[], # 5-web_of_science
[12], # 6-zotero
[], #  7-csv
[32], #  8-istex
[], # 9-scoap
[], # 10-repec
]
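
The todo above suggests reading these expected counts from a JSON overview placed alongside the samples instead of hard-coding the list; a minimal sketch, assuming a hypothetical overview.json in DATA_SAMPLE_DIR that maps resource-type indices to expected document counts:

    import json
    import os

    def load_expected_ndocs(sample_dir):
        # hypothetical overview.json content: {"1": [7], "3": [10], "6": [12], "8": [32]}
        with open(os.path.join(sample_dir, "overview.json")) as f:
            overview = json.load(f)
        ndocs = [None] + [[] for _ in range(10)]  # same shape as DATA_SAMPLE_NDOCS
        for index, counts in overview.items():
            ndocs[int(index)] = counts
        return ndocs
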
@@ -121,7 +123,7 @@ class ToolChainRecipes(TestCase):
self.log.debug("\t- Parsing and indexing corpus")
parse(self.corpus)
real_ndocs = self.__count_node_children__(self.corpus, "DOCUMENT")
print('==>\t'+str(source_type)+'\t'+str(i)+'\t'+sample_file+'\t'+str(real_ndocs))
# print('==>\t'+str(source_type)+'\t'+str(i)+'\t'+sample_file+'\t'+str(real_ndocs))
self.assertEqual(real_ndocs, expected_ndocs)
status = self.corpus.status()
self.log.debug("\t- Extracting ngrams")
@@ -137,29 +139,29 @@ class ToolChainRecipes(TestCase):
'''testing Europresse parsing'''
self._run_recipe(1, DATA_SAMPLE_NDOCS[1])
def tests_002(self):
self._run_recipe(2, DATA_SAMPLE_NDOCS[2])
# def tests_002_jstor(self):
# self._run_recipe(2, DATA_SAMPLE_NDOCS[2])
def tests_003(self):
def tests_003_pubmed(self):
self._run_recipe(3, DATA_SAMPLE_NDOCS[3])
def tests_004(self):
self._run_recipe(4, DATA_SAMPLE_NDOCS[4])
def tests_005(self):
self._run_recipe(5, DATA_SAMPLE_NDOCS[5])
# def tests_004_scopus(self):
# self._run_recipe(4, DATA_SAMPLE_NDOCS[4])
#
# def tests_005_web_of_science(self):
# self._run_recipe(5, DATA_SAMPLE_NDOCS[5])
def tests_006(self):
def tests_006_zotero(self):
self._run_recipe(6, DATA_SAMPLE_NDOCS[6])
def tests_007(self):
self._run_recipe(7, DATA_SAMPLE_NDOCS[7])
# def tests_007_csv(self):
# self._run_recipe(7, DATA_SAMPLE_NDOCS[7])
def tests_008(self):
def tests_008_istex(self):
self._run_recipe(8, DATA_SAMPLE_NDOCS[8])
def tests_009(self):
self._run_recipe(9, DATA_SAMPLE_NDOCS[9])
def tests_010(self):
self._run_recipe(10, DATA_SAMPLE_NDOCS[10])
# def tests_009_scoap(self):
# self._run_recipe(9, DATA_SAMPLE_NDOCS[9])
#
# def tests_010_repec(self):
# self._run_recipe(10, DATA_SAMPLE_NDOCS[10])