EUROPRESSE

97f70d54 · c24b · cc674dea · 97f70d54 · 97f70d54 · 97f70d54
Commit 97f70d54 authored Aug 25, 2016 by c24b
4 changed files
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -145,9 +145,9 @@ def get_tagger(lang):

 RESOURCETYPES = [
    {   "type":1,
-        'name': 'Europress',
-        'format': 'Europress',
-        'parser': "EuropressParser",
+        'name': 'Europresse',
+        'format': 'Europresse',
+        'parser': "EuropresseParser",
        'file_formats':["zip"],
        'crawler': None,
        'default_languages': ['en', 'fr'],

--- a/gargantext/util/parsers/EUROPRESS.py
+++ b/gargantext/util/parsers/EUROPRESS.py
@@ -29,7 +29,7 @@ import sys
 from ._Parser import Parser


-class EuropressParser(Parser):
+class EuropresseParser(Parser):

    def parse(self, file):
        #print("europr_parser file", file)
@@ -266,13 +266,3 @@ class EuropressParser(Parser):

        except:
            print('Something bad happened.')
-
-
-if __name__ == "__main__":
-    e = EuropressFileParser()
-    hyperdata = e.parse(str(sys.argv[1]))
-    for h in hyperdata:
-        try:
-            print(h['journal'], ":", h['publication_date'])
-        except:
-            pass
--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
@@ -48,7 +48,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
        documents_count = 0
        source = get_resource(resource["type"])
        #load only the docs that have passed the parsing without error
-        docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.hyperdata["skipped_docs"]]
+
        #load available taggers for default langage of plateform
        #print(LANGUAGES.keys())
        tagger_bots = {lang: load_tagger(lang) for lang in corpus.hyperdata["languages"] \
@@ -56,53 +56,54 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
        supported_taggers_lang = tagger_bots.keys()
        #sort docs by lang?
        # for lang, tagger in tagger_bots.items():
-        for documents_count, document in enumerate(docs):
-
-            language_iso2 = document.hyperdata.get('language_iso2')
-            if language_iso2 not in supported_taggers_lang:
-                #print("ERROR NO language_iso2")
-                document.status("NGRAMS", error="Error: unsupported language for tagging")
-                session.add(document)
-                session.commit()
-                corpus.hyperdata["skipped_docs"].append(document.id)
-                corpus.save_hyperdata()
-                continue
-            else:
-
-                tagger = tagger_bots[language_iso2]
-
-                #print(language_iso2)
-                #>>> romain-stable-patch
-                #to do verify if document has no KEYS to index
-                for key in keys:
-                    try:
-                        value = document.hyperdata[str(key)]
-                        if not isinstance(value, str):
-                            #print("DBG wrong content in doc for key", key)
+        for documents_count, document in enumerate(corpus.children('DOCUMENT')):
+
+            if doc.id not in corpus.hyperdata["skipped_docs"]:
+                language_iso2 = document.hyperdata.get('language_iso2')
+                if language_iso2 not in supported_taggers_lang:
+                    #print("ERROR NO language_iso2")
+                    document.status("NGRAMS", error="Error: unsupported language for tagging")
+                    session.add(document)
+                    session.commit()
+                    corpus.hyperdata["skipped_docs"].append(document.id)
+                    corpus.save_hyperdata()
+                    continue
+                else:
+
+                    tagger = tagger_bots[language_iso2]
+
+                    #print(language_iso2)
+                    #>>> romain-stable-patch
+                    #to do verify if document has no KEYS to index
+                    for key in keys:
+                        try:
+                            value = document.hyperdata[str(key)]
+                            if not isinstance(value, str):
+                                #print("DBG wrong content in doc for key", key)
+                                continue
+                                # get ngrams
+                            for ngram in tagger.extract(value):
+                                tokens = tuple(normalize_forms(token[0]) for token in ngram)
+                                if do_subngrams:
+                                    # ex tokens = ["very", "cool", "exemple"]
+                                    #    subterms = [['very', 'cool'],
+                                    #                ['very', 'cool', 'exemple'],
+                                    #                ['cool', 'exemple']]
+
+                                    subterms = subsequences(tokens)
+                                else:
+                                    subterms = [tokens]
+
+                                for seqterm in subterms:
+                                    ngram = ' '.join(seqterm)
+                                    if len(ngram) > 1:
+                                        # doc <=> ngram index
+                                        nodes_ngrams_count[(document.id, ngram)] += 1
+                                        # add fields :   terms          n
+                                        ngrams_data.add((ngram[:255], len(seqterm), ))
+                        except:
+                            #value not in doc
                            continue
-                            # get ngrams
-                        for ngram in tagger.extract(value):
-                            tokens = tuple(normalize_forms(token[0]) for token in ngram)
-                            if do_subngrams:
-                                # ex tokens = ["very", "cool", "exemple"]
-                                #    subterms = [['very', 'cool'],
-                                #                ['very', 'cool', 'exemple'],
-                                #                ['cool', 'exemple']]
-
-                                subterms = subsequences(tokens)
-                            else:
-                                subterms = [tokens]
-
-                            for seqterm in subterms:
-                                ngram = ' '.join(seqterm)
-                                if len(ngram) > 1:
-                                    # doc <=> ngram index
-                                    nodes_ngrams_count[(document.id, ngram)] += 1
-                                    # add fields :   terms          n
-                                    ngrams_data.add((ngram[:255], len(seqterm), ))
-                    except:
-                        #value not in doc
-                        continue

            # integrate ngrams and nodes-ngrams
            if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:

--- a/gargantext/util/toolchain/parsing.py
+++ b/gargantext/util/toolchain/parsing.py
@@ -164,7 +164,6 @@ def parse(corpus):
    except Exception as error:
        corpus.status('Docs', error=error)
        corpus.save_hyperdata()
-        session.commit()
        raise error