Commit fa1a268e authored by c24b

load corresponding tagger without ngramsextractors

parent 9fcf90b5
gargantext/util/ngramsextractors.py

from gargantext.util.languages import languages
from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN

import nltk
import re


class NgramsExtractor:

    def __init__(self, tagger):
        self._tagger = tagger()

    @staticmethod
    def clean_text(text):
        """Clean the text for better POS tagging.
        For now, only removes (short) XML tags.
        """
        return re.sub(r'<[^>]{0,45}>', '', text)

    def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
        text = self.clean_text(text)
        grammar = nltk.RegexpParser(label + ': ' + rule)
        tagged_tokens = list(self._tagger.tag_text(text))
        if len(tagged_tokens):
            grammar_parsed = grammar.parse(tagged_tokens)
            for subtree in grammar_parsed.subtrees():
                if subtree.label() == label:
                    if len(subtree) < max_n_words:
                        yield subtree.leaves()
                        # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]


class NgramsExtractors(dict):

    def __missing__(self, key):
        if not isinstance(key, str):
            raise KeyError
        if len(key) == 2 and key == key.lower():
            tagger = LANGUAGES[key]['tagger']
            self[key] = NgramsExtractor(tagger)
        else:
            self[key] = self[LANGUAGES[key].iso3]
        return self[key]


# this below will be shared within the current thread
ngramsextractors = NgramsExtractors()
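
For reference, NgramsExtractors builds its entries lazily through dict.__missing__: a two-letter lowercase key instantiates the tagger registered for that language in LANGUAGES, and any other key falls back to the iso3 alias of its LANGUAGES entry. A minimal usage sketch (illustrative only; it assumes LANGUAGES['en']['tagger'] is a POS-tagger class exposing tag_text(), as the extractor expects):

# Illustrative sketch: the 'en' key and the sample sentence are assumptions.
extractor = ngramsextractors['en']            # first access builds and caches the extractor
assert ngramsextractors['en'] is extractor    # later accesses reuse the cached instance
for leaves in extractor.extract('Wild pollinators visit the flowers.'):
    print(leaves)    # e.g. [('wild', 'JJ'), ('pollinators', 'NNS')]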
 from gargantext.util.db import *
 from gargantext.models import *
 from gargantext.constants import *
-from gargantext.util.ngramsextractors import ngramsextractors
+#from gargantext.util.ngramsextractors import ngramsextractors
 from collections import defaultdict
 from re import sub
...
@@ -47,30 +47,38 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
     nodes_ngrams_count = defaultdict(int)
     ngrams_data = set()
     # extract ngrams
-    resource_type_index = corpus.resources()[0]['type']
+    resource = corpus.resources()[0]
     documents_count = 0
-    resource_type = RESOURCETYPES[resource_type_index]
-    default_language_iso2 = resource_type['default_languages']
+    # load the available taggers for the source's default languages
+    taggers_bots = {lang: load_tagger(lang) for lang in resource['default_languages']}
+    # skip documents that have an unsupported language
+    corpus.skipped_docs = [doc.id for doc in corpus.children('DOCUMENT') if doc.hyperdata['language_iso2'] not in resource['default_languages']]
+    print(set(corpus.languages.keys()).intersection(resource['default_languages']))
     for documents_count, document in enumerate(corpus.children('DOCUMENT')):
-        # get ngrams extractor for the current document
-        language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
-        try:
-            # this looks for a tagger in constants.LANGUAGES
-            ngramsextractor = ngramsextractors[language_iso2]
-        except KeyError:
-            # skip document
-            print('Unsupported language: `%s` (doc #%i)' % (language_iso2, document.id))
-            # and remember that for later processes (eg stemming)
-            document.hyperdata['__skipped__'] = 'ngrams_extraction'
-            document.save_hyperdata()
-            session.commit()
-            if language_iso2 in corpus.hyperdata['languages']:
-                skipped_lg_infos = corpus.hyperdata['languages'].pop(language_iso2)
-                corpus.hyperdata['languages']['__skipped__'][language_iso2] = skipped_lg_infos
-                corpus.save_hyperdata()
-                session.commit()
-            continue
-        # extract ngrams on each of the considered keys
-        for key in keys:
-            value = document.hyperdata.get(key, None)
-            if not isinstance(value, str):
+        lang_doc = document.hyperdata['language_iso2']
+        if document.id not in corpus.skipped_docs:
+            # extract ngrams on each of the considered keys
+            ngramextractor = taggers_bots[lang_doc]
+            for key in keys:
+                value = document.hyperdata.get(key, None)
+                if not isinstance(value, str):
...
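
The load_tagger helper used above is defined outside this hunk. A minimal sketch consistent with the commit message (assumption only: it instantiates the tagger class registered in constants.LANGUAGES directly, without the NgramsExtractor wrapper that the old code path used):

# Hypothetical sketch; load_tagger is not shown in this diff.
from gargantext.constants import LANGUAGES

def load_tagger(lang):
    # instantiate the POS-tagger class registered for this iso2 code
    return LANGUAGES[lang]['tagger']()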