Commit b5004e99 authored by c24b

NAMING convention for PARSER

parent 9c49ac54
@@ -224,21 +224,24 @@ PARSERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
 CRAWLERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["crawler"] is not None]

 def load_parser(resource):
-    '''given a resource load the corresponding Crawler
+    '''given a resource load the corresponding Parser
     resource(dict) > Parser(object)
+    example with resource ISTexParser:
+        PARSER filename: ISTEX
+        PARSER object:   ISTexParser
     '''
-    if resource["parser"] is not None:
-        filename = resource["parser"].replace("Parser", '')
-        print(filename)
-        module = 'gargantext.util.parsers.%s' %(filename)
-        module = importlib.import_module(module)
-        return getattr(module, resource["parser"])
-    else:
-        return None
+    filename = resource["parser"].replace("Parser", '').upper()
+    module = 'gargantext.util.parsers.%s' %(filename)
+    module = importlib.import_module(module)
+    return getattr(module, resource["parser"])

 def load_crawler(resource):
     '''given a resource load the corresponding Crawler()
     resource(dict) > Crawler(object)
+    example with resource ISTexCrawler:
+        CRAWLER filename: ISTEX
+        CRAWLER object:   ISTexCrawler
     '''
     filename = resource["name"].replace("Crawler", "")
     module = 'gargantext.util.crawlers.%s' %(filename)
...
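A minimal sketch of the naming convention the rewritten load_parser relies on, assuming a resource dict shaped like a RESOURCETYPES entry (the dict below is an illustrative example, not a real entry): the parser class name minus its "Parser" suffix, upper-cased, gives the module filename under gargantext.util.parsers.

# Sketch only: reproduces the name-mangling step of load_parser, without importing anything.
def parser_module_path(resource):
    """'ISTexParser' -> 'gargantext.util.parsers.ISTEX'"""
    filename = resource["parser"].replace("Parser", "").upper()
    return "gargantext.util.parsers.%s" % filename

resource = {"name": "ISTex", "parser": "ISTexParser"}   # hypothetical resource dict
assert parser_module_path(resource) == "gargantext.util.parsers.ISTEX"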
from ._Parser import Parser
# from ..NgramsExtractors import *
import sys
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np
import os


class CSVParser(Parser):

    def CSVsample(self, small_contents, delim):
        reader = csv.reader(small_contents, delimiter=delim)
        Freqs = []
        for row in reader:
            Freqs.append(len(row))
        return Freqs

    def parse(self, filebuf):
        print("CSV: parsing (assuming UTF-8 and LF line endings)")
        contents = filebuf.read().decode("UTF-8").split("\n")
        sample_size = 10
        sample_contents = contents[0:sample_size]
        hyperdata_list = []

        # # = = = = [ Getting delimiters frequency ] = = = = #
        PossibleDelimiters = [',', ' ', '\t', ';', '|', ':']
        AllDelimiters = {}
        for delim in PossibleDelimiters:
            AllDelimiters[delim] = self.CSVsample(sample_contents, delim)
        # # = = = = [ / Getting delimiters frequency ] = = = = #
        # # OUTPUT example:
        # # AllDelimiters = {
        # #   '\t': [1, 1, 1, 1, 1],
        # #   ' ' : [1, 13, 261, 348, 330],
        # #   ',' : [15, 15, 15, 15, 15],
        # #   ';' : [1, 1, 1, 1, 1],
        # #   '|' : [1, 1, 1, 1, 1]
        # # }

        # # = = = = [ Stand.Dev=0 & Sum of delimiters ] = = = = #
        Delimiters = []
        for d in AllDelimiters:
            freqs = AllDelimiters[d]
            suma = np.sum(freqs)
            if suma > 0:
                std = np.std(freqs)
                # print([d, suma, len(freqs), std])
                if std == 0:
                    Delimiters.append([d, suma, len(freqs), std])
        # # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
        # # OUTPUT example:
        # # Delimiters = [
        # #   ['\t', 5, 5, 0.0],
        # #   [',', 75, 5, 0.0],
        # #   ['|', 5, 5, 0.0]
        # # ]

        # # = = = = [ Delimiter selection ] = = = = #
        Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
        HighestDelim = Sorted_Delims[0][0]
        # HighestDelim = ","
        print("CSV selected delimiter:", [HighestDelim])
        # # = = = = [ / Delimiter selection ] = = = = #

        # # = = = = [ First data coordinate ] = = = = #
        Coords = {
            "row": -1,
            "column": -1
        }
        reader = csv.reader(contents, delimiter=HighestDelim)
        for rownum, tokens in enumerate(reader):
            if rownum % 250 == 0:
                print("CSV row: ", rownum)
            joined_tokens = "".join(tokens)
            if Coords["row"] < 0 and len(joined_tokens) > 0:
                Coords["row"] = rownum
                for columnum in range(len(tokens)):
                    t = tokens[columnum]
                    if len(t) > 0:
                        Coords["column"] = columnum
                        break
        # # = = = = [ / First data coordinate ] = = = = #

        # # = = = = [ Setting Headers ] = = = = #
        Headers_Int2Str = {}
        reader = csv.reader(contents, delimiter=HighestDelim)
        for rownum, tokens in enumerate(reader):
            if rownum >= Coords["row"]:
                for columnum in range(Coords["column"], len(tokens)):
                    t = tokens[columnum]
                    Headers_Int2Str[columnum] = t
                break
        # print("Headers_Int2Str")
        # print(Headers_Int2Str)
        # # = = = = [ / Setting Headers ] = = = = #
        # # OUTPUT example:
        # # Headers_Int2Str = {
        # #   0: 'publication_date',
        # #   1: 'publication_month',
        # #   2: 'publication_second',
        # #   3: 'abstract'
        # # }

        # # = = = = [ Reading the whole CSV and saving ] = = = = #
        hyperdata_list = []
        reader = csv.reader(contents, delimiter=HighestDelim)
        for rownum, tokens in enumerate(reader):
            if rownum > Coords["row"]:
                RecordDict = {}
                for columnum in range(Coords["column"], len(tokens)):
                    data = tokens[columnum]
                    RecordDict[Headers_Int2Str[columnum]] = data
                if len(RecordDict.keys()) > 0:
                    hyperdata_list.append(RecordDict)
        # # = = = = [ / Reading the whole CSV and saving ] = = = = #

        return hyperdata_list
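For reference, a self-contained sketch of the delimiter-detection heuristic used by CSVParser.parse above: a delimiter is plausible when every sampled row splits into the same number of fields (standard deviation of the counts is zero), and among those candidates the one producing the most fields wins. The sample lines here are made up.

import csv
import numpy as np

sample = ["a;b;c", "1;2;3", "4;5;6"]              # invented CSV sample
candidates = [',', ' ', '\t', ';', '|', ':']

best = None
for delim in candidates:
    counts = [len(row) for row in csv.reader(sample, delimiter=delim)]
    # keep delimiters that give a constant, non-empty field count on every row
    if np.sum(counts) > 0 and np.std(counts) == 0:
        if best is None or np.sum(counts) > best[1]:
            best = (delim, np.sum(counts))

print("detected delimiter:", best[0] if best else None)   # ';'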
from ._Parser import Parser
from datetime import datetime
from io import BytesIO
import json


class ISTexParser(Parser):

    def parse(self, filebuf):
        contents = filebuf.read().decode("UTF-8")
        data = json.loads(contents)
        filebuf.close()
        json_docs = data["hits"]
        hyperdata_list = []
        hyperdata_path = {
            "id"               : "id",
            "source"           : 'corpusName',
            "title"            : 'title',
            "genre"            : "genre",
            "language_iso3"    : 'language',
            "doi"              : 'doi',
            "host"             : 'host',
            "publication_date" : 'publicationDate',
            "abstract"         : 'abstract',
            # "authors"        : 'author',
            "authorsRAW"       : 'author',
            "keywords"         : "keywords"
        }
        suma = 0

        for json_doc in json_docs:
            hyperdata = {}
            for key, path in hyperdata_path.items():
                try:
                    # print(path, " ==> ", len(json_doc[path]))
                    hyperdata[key] = json_doc[path]
                except:
                    pass
            # print("|", hyperdata["language_iso3"])

            if "doi" in hyperdata:
                hyperdata["doi"] = hyperdata["doi"][0]

            keywords = []
            if "keywords" in hyperdata:
                for keyw in hyperdata["keywords"]:
                    keywords.append(keyw["value"])
                hyperdata["keywords"] = ", ".join(keywords)

            moredate = False
            moresource = False
            if "host" in hyperdata:
                if "genre" in hyperdata["host"] and len(hyperdata["host"]["genre"]) > 0:
                    if "genre" in hyperdata and len(hyperdata["genre"]) == 0:
                        hyperdata["genre"] = hyperdata["host"]["genre"]
                # print(hyperdata["host"])
                if "pubdate" in hyperdata["host"]:
                    onebuffer = hyperdata["publication_date"]
                    hyperdata["publication_date"] = []
                    hyperdata["publication_date"].append(onebuffer)
                    hyperdata["publication_date"].append(hyperdata["host"]["pubdate"])
                if "title" in hyperdata["host"]:
                    hyperdata["journal"] = hyperdata["host"]["title"]

            authors = False
            if "authorsRAW" in hyperdata:
                names = []
                for author in hyperdata["authorsRAW"]:
                    names.append(author["name"])
                hyperdata["authors"] = ", ".join(names)

            if "host" in hyperdata:
                hyperdata.pop("host")

            if "genre" in hyperdata:
                if len(hyperdata["genre"]) == 0:
                    hyperdata.pop("genre")

            if "language_iso3" in hyperdata:
                # retrieve lang if lang != [] and lang != ["unknown"]
                # ---------------------------------------------------
                if len(hyperdata["language_iso3"]) > 0 and hyperdata["language_iso3"][0] != "unknown":
                    hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
                # default value = eng
                # possibly even better: langid.classify(abstract)
                else:
                    # NB: 97% of ISTEX docs are eng, hence the default
                    # ------------------------------------------------
                    hyperdata["language_iso3"] = "eng"
                    # (cf. api.istex.fr/document/?q=*&facet=language
                    #  and langid tests on the language=["unknown"] docs)

            if "publication_date" in hyperdata:
                RealDate = hyperdata["publication_date"]
                if "publication_date" in hyperdata:
                    hyperdata.pop("publication_date")
                if isinstance(RealDate, list):
                    RealDate = RealDate[0]
                # print(RealDate, " | length:", len(RealDate))
                Decision = ""
                if len(RealDate) > 4:
                    if len(RealDate) > 8:
                        try: Decision = datetime.strptime(RealDate, '%Y-%b-%d').date()
                        except:
                            try: Decision = datetime.strptime(RealDate, '%Y-%m-%d').date()
                            except: Decision = False
                    else:
                        try: Decision = datetime.strptime(RealDate, '%Y-%b').date()
                        except:
                            try: Decision = datetime.strptime(RealDate, '%Y-%m').date()
                            except: Decision = False
                else:
                    try: Decision = datetime.strptime(RealDate, '%Y').date()
                    except: Decision = False

                if Decision != False:
                    hyperdata["publication_year"] = str(Decision.year)
                    hyperdata["publication_month"] = str(Decision.month)
                    hyperdata["publication_day"] = str(Decision.day)
                    hyperdata_list.append(hyperdata)
                    # print("\t||", hyperdata["title"])
                    # print("\t\t", Decision)
                    # print("=============================")
                # else:
                #     suma += 1
                #     if "pubdate" in json_doc:
                #         print("\tfail pubdate:", json_doc["pubdate"])

        # print("nb_hits:", len(json_docs))
        # print("\t - nb_fails:", suma)
        # print(" -- - - - - - -- - -")
        return hyperdata_list
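A standalone illustration of the date-normalisation cascade in ISTexParser.parse above: formats are tried from most to least precise, depending on the length of the raw string. The helper name and the inputs are ours, for illustration only.

from datetime import datetime

def parse_publication_date(raw):
    """Try ISTEX-style date strings from most to least precise; return a date or None."""
    if len(raw) > 8:
        formats = ('%Y-%b-%d', '%Y-%m-%d')
    elif len(raw) > 4:
        formats = ('%Y-%b', '%Y-%m')
    else:
        formats = ('%Y',)
    for fmt in formats:
        try:
            return datetime.strptime(raw, fmt).date()
        except ValueError:
            continue
    return None

print(parse_publication_date("2015-03-17"))  # 2015-03-17
print(parse_publication_date("2015-03"))     # 2015-03-01 (day defaults to 1)
print(parse_publication_date("2015"))        # 2015-01-01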
from ._Parser import Parser
from gargantext.util.languages import languages
#from admin.utils import PrintException


class RepecParser(Parser):

    # def __init__(self, language_cache=None):
    #
    #     #super(Parser, self).__init__()
    #     #super(Parser, self).__init__()
    #     self._languages_cache = LanguagesCache() if language_cache is None else language_cache

    _begin = 6
    _parameters = {
        b"ER": {"type": "delimiter"},
        b"T1": {"type": "hyperdata", "key": "title", "separator": " "},
        b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
        b"A1": {"type": "hyperdata", "key": "authors", "separator": "\n"},
        b"JO": {"type": "hyperdata", "key": "journal"},
        b"UR": {"type": "hyperdata", "key": "doi"},
        b"Y1": {"type": "hyperdata", "key": "publication_year"},
        b"PD": {"type": "hyperdata", "key": "publication_month"},
        b"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
        b"LA": {"type": "hyperdata", "key": "language_iso2"},
        b"N2": {"type": "hyperdata", "key": "abstract", "separator": " "},
        b"WC": {"type": "hyperdata", "key": "fields"},
    }

    def parse(self, file):
        hyperdata = {}
        last_key = None
        last_values = []
        # browse every line of the file
        for line in file:
            if len(line) > 2:
                # extract the parameter key
                parameter_key = line[:2]
                if parameter_key != b' ' and parameter_key != last_key:
                    if last_key in self._parameters:
                        # translate the parameter key
                        parameter = self._parameters[last_key]
                        if parameter["type"] == "hyperdata":
                            separator = parameter["separator"] if "separator" in parameter else ""
                            if parameter["key"] == "publication_year":
                                hyperdata[parameter["key"]] = separator.join(last_values)[:4]
                            else:
                                hyperdata[parameter["key"]] = separator.join(last_values)
                        elif parameter["type"] == "delimiter":
                            if 'language_fullname' not in hyperdata.keys():
                                if 'language_iso3' not in hyperdata.keys():
                                    if 'language_iso2' not in hyperdata.keys():
                                        hyperdata['language_iso2'] = 'en'
                            yield hyperdata
                            hyperdata = {}
                    last_key = parameter_key
                    last_values = []
                try:
                    last_values.append(line[self._begin:-1].decode())
                except Exception as error:
                    print(error)
        # if a hyperdata object is left in memory, yield it as well
        if hyperdata:
            yield hyperdata
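For orientation, here is the kind of tag/value record RepecParser.parse walks through: a two-byte tag, the value starting at byte offset _begin = 6, and ER closing a record. The record below is invented, just to show how the slicing works (the parser itself keeps the tags as bytes).

# Hypothetical RePEc/RIS-style record, as bytes (the parser reads the file in binary mode).
sample = [
    b"T1  - A made-up paper title\n",
    b"A1  - Doe, Jane\n",
    b"Y1  - 2014-06\n",
    b"LA  - en\n",
    b"ER  - \n",
]
for line in sample:
    tag, value = line[:2].decode(), line[6:-1].decode()   # same slicing as the parser
    print(tag, "->", value)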
@@ -3,7 +3,7 @@ When started, it initiates the parser;
 when passed text, the text is piped to the parser.
 When ended, the parser is closed and the tagged word returned as a tuple.
 """
-from constants import RULE_JJNN, DEFAULT_MAX_NGRAM_LEN
+from gargantext.constants import RULE_JJNN, DEFAULT_MAX_NGRAM_LEN
 import re
 import nltk
@@ -50,6 +50,7 @@ class Tagger:
         This method is called by the constructor, and can be overriden by
         inherited classes.
         """
+        print("START")
         self.extract(self.text)

     def stop(self):
...
@@ -26,10 +26,11 @@ def prepare_stemmers(corpus):
         # always get a generic stemmer in case language code unknown
         '__unknown__' : SnowballStemmer("english")
     }
-    for lgiso2 in corpus.hyperdata['languages'].keys():
-        if (lgiso2 != '__skipped__'):
-            lgname = languages[lgiso2].name.lower()
-            stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
+    for lang in corpus.hyperdata["languages"].keys():
+        print(lang)
+        if (lang != '__skipped__'):
+            lgname = languages[lang].name.lower()
+            stemmers_by_lg[lang] = SnowballStemmer(lgname)
     return stemmers_by_lg

 def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
@@ -38,6 +39,7 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
        2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
        3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)
     """
+    print(corpus.languages.keys())
     stop_ngrams_ids = {}
     # we will need the ngrams of the stoplist to filter
...
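As a reminder of what the per-language stemmer table built in prepare_stemmers gives you, a tiny NLTK SnowballStemmer example (the tokens are made up):

from nltk.stem.snowball import SnowballStemmer

stemmers_by_lg = {
    'en': SnowballStemmer("english"),
    'fr': SnowballStemmer("french"),
    '__unknown__': SnowballStemmer("english"),   # fallback, as in prepare_stemmers
}
print(stemmers_by_lg['en'].stem("languages"))    # 'languag'
print(stemmers_by_lg['fr'].stem("parseurs"))     # 'parseur' (plural stripped)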
@@ -53,19 +53,21 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
         #load available taggers for source default langage
         tagger_bots = {lang: load_tagger(lang) for lang in source['default_languages']}
         #skipped documents that have been skipped previously for parsing error or unsupported language
+        print(corpus.skipped_docs)
         docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
+        #sort docs by lang?
+        docs = sorted(docs, key= lambda k: k.language_iso2)
         #print(corpus.hyperdata["languages"])
         for documents_count, document in enumerate(docs):
-            lang_doc = document.hyperdata['language_iso2']
-            ngramextractor = tagger_bots[lang_doc]
+            lang_doc = document.language_iso2
+            print(lang_doc)
             for key in keys:
                 value = document.hyperdata.get(key, None)
                 if not isinstance(value, str):
                     continue
                 # get ngrams
-                for ngram in ngramsextractor.extract(value):
+                for ngram in tagger_bots[lang_doc](value):
                     tokens = tuple(normalize_forms(token[0]) for token in ngram)
                     if do_subngrams:
                         # ex tokens = ["very", "cool", "exemple"]
                         #    subterms = [['very', 'cool'],
@@ -93,11 +95,12 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                 corpus.status('Ngrams', progress=documents_count+1)
                 corpus.save_hyperdata()
                 session.commit()
-        # integrate ngrams and nodes-ngrams
-        _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
-        corpus.status('Ngrams', progress=documents_count+1, complete=True)
-        corpus.save_hyperdata()
-        session.commit()
+        else:
+            # integrate ngrams and nodes-ngrams
+            _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
+            corpus.status('Ngrams', progress=documents_count+1, complete=True)
+            corpus.save_hyperdata()
+            session.commit()
     except Exception as error:
         corpus.status('Ngrams', error=error)
         corpus.save_hyperdata()
...
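The do_subngrams comment in the hunk above refers to expanding each extracted ngram into its contiguous sub-sequences. A rough sketch of that idea follows; the helper name is ours, and the project's actual expansion (for instance whether 1-grams are kept, or the maximum length used) may differ.

def sub_ngrams(tokens, max_len=3):
    """All contiguous sub-sequences of `tokens` of length 1..max_len (illustrative only)."""
    n = len(tokens)
    return [tokens[i:j]
            for i in range(n)
            for j in range(i + 1, min(i + max_len, n) + 1)]

print(sub_ngrams(["very", "cool", "exemple"]))
# [['very'], ['very', 'cool'], ['very', 'cool', 'exemple'],
#  ['cool'], ['cool', 'exemple'], ['exemple']]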
@@ -9,24 +9,31 @@ def parse(corpus):
     try:
         documents_count = 0
         corpus.status('Docs', progress=0)
+        #print(corpus.resources())
         #get the sources capabilities for a given corpus
-        sources = [get_resource(resource["type"]) for resource in corpus.resources() if not 'extracted' in resource.keys() ]
+        resource = corpus.resources()[0]
+        print(resource)
+        sources = [get_resource(resource["type"]) for resource in corpus.resources()]
+        print(sources)
         if len(sources) == 0:
             #>>> documents have already been parsed?????
-            return
+            raise ValueError(len(sources))
         if len(sources) > 0:
             #>>> necessarily 1 corpus = 1 source in the current architecture
             source = sources[0]
-            if resource["parser"] is None:
+            if source["parser"] is None:
                 #corpus.status(error)
                 raise ValueError("Resource '%s' has no Parser" %resource["name"])
             else:
-                corpus.languages = defaultdict.from_keys(sources[0]["default_languages"], 0)
+                corpus.languages = defaultdict.fromkeys(sources[0]["default_languages"], 0)
                 corpus.skipped_docs = []
+                session.add(corpus)
+                session.commit()
                 #load the corresponding parser
                 resource_parser = load_parser(source)
                 skipped_languages = []
                 # extract and insert documents from resource.path into database
+                print(resource)
                 for hyperdata in resource_parser(resource["path"]):
                     # indexed text fields defined in constants
                     for k in DEFAULT_INDEX_FIELDS:
@@ -39,7 +46,7 @@ def parse(corpus):
                     # a simple census to raise language info at corpus level
                     if "language_iso2" in hyperdata.keys():
                         try:
-                            corpus.languages[hyperdata["language_iso2"]] += 1
+                            corpus.hyperdata["languages"][hyperdata["language_iso2"]] += 1
                         except KeyError:
                             hyperdata["error"] = "Error: unsupported language"
                             skipped_languages.append(hyperdata["language_iso2"])
@@ -47,8 +54,6 @@ def parse(corpus):
                         hyperdata["error"] = "Error: no language found"
                     # save as DB child
                     # ----------------
                     document = corpus.add_child(
                         typename = 'DOCUMENT',
                         name = hyperdata.get('title', '')[:255],
@@ -60,21 +65,21 @@ def parse(corpus):
                         document.status('Parsing', error= document.hyperdata["error"])
                         #session.delete(document)
                         corpus.skipped_docs.append(document.id)
                     # logging
                     if documents_count % BATCH_PARSING_SIZE == 0:
                         corpus.status('Docs', progress=documents_count)
                         corpus.save_hyperdata()
+                        session.add(corpus)
                         session.commit()
                     documents_count += 1
                 # update info about the resource
                 resource['extracted'] = True
                 # add a corpus-level info about languages adding a __skipped__ info
-                corpus.hyperdata['languages']['__skipped__'] = Counter(skipped_languages)
+                corpus.languages['__skipped__'] = Counter(skipped_languages)
                 # commit all changes
                 corpus.status('Docs', progress=documents_count, complete=True)
                 corpus.save_hyperdata()
+                session.add(corpus)
                 session.commit()
     except Exception as error:
         corpus.status('Docs', error=error)
...
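Side note on the defaultdict.fromkeys fix above: from_keys does not exist, while fromkeys (inherited from dict) returns a defaultdict whose default_factory is None. Unknown language codes therefore still raise KeyError, which is exactly what the census code catches to flag unsupported languages. A quick check:

from collections import defaultdict

langs = defaultdict.fromkeys(["en", "fr"], 0)
print(langs)             # defaultdict(None, {'en': 0, 'fr': 0})
langs["en"] += 1         # fine: the key was pre-seeded
try:
    langs["de"] += 1     # no default_factory, so this raises
except KeyError:
    print("unsupported language: de")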