Commit fa1a268e authored by c24b

load corresponding tagger without ngramsextractors

parent 9fcf90b5
gargantext/util/ngramsextractors.py

from gargantext.util.languages import languages
from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN

import nltk
import re


class NgramsExtractor:

    def __init__(self, tagger):
        self._tagger = tagger()

    @staticmethod
    def clean_text(text):
        """Clean the text for better POS tagging.
        For now, only removes (short) XML tags.
        """
        return re.sub(r'<[^>]{0,45}>', '', text)

    def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
        text = self.clean_text(text)
        grammar = nltk.RegexpParser(label + ': ' + rule)
        tagged_tokens = list(self._tagger.tag_text(text))
        if len(tagged_tokens):
            grammar_parsed = grammar.parse(tagged_tokens)
            for subtree in grammar_parsed.subtrees():
                if subtree.label() == label:
                    if len(subtree) < max_n_words:
                        yield subtree.leaves()
                        # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]


class NgramsExtractors(dict):

    def __missing__(self, key):
        if not isinstance(key, str):
            raise KeyError
        if len(key) == 2 and key == key.lower():
            tagger = LANGUAGES[key]['tagger']
            self[key] = NgramsExtractor(tagger)
        else:
            self[key] = self[LANGUAGES[key].iso3]
        return self[key]


# this below will be shared within the current thread
ngramsextractors = NgramsExtractors()
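For context, the chunking step that extract() delegates to nltk can be reproduced standalone. The NP rule below is a simplified stand-in for RULE_JJNN from gargantext.constants (an assumption, not the project's exact pattern), and the tagged input mimics what a tagger's tag_text() returns:

import nltk

# simplified stand-in for RULE_JJNN (illustrative only)
RULE_NP = '{<JJ.*>*<NN.*>+}'

grammar = nltk.RegexpParser('NP: ' + RULE_NP)
# a tagger's tag_text() yields (token, POS) pairs like these
tagged_tokens = [('wild', 'JJ'), ('pollinators', 'NNS'),
                 ('visit', 'VBP'), ('crops', 'NNS')]
for subtree in grammar.parse(tagged_tokens).subtrees():
    if subtree.label() == 'NP':
        print(subtree.leaves())
# [('wild', 'JJ'), ('pollinators', 'NNS')]
# [('crops', 'NNS')]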
gargantext/util/toolchain/ngrams_extraction.py

from gargantext.util.db import *
from gargantext.models import *
from gargantext.constants import *
#from gargantext.util.ngramsextractors import ngramsextractors

from collections import defaultdict
from re import sub
@@ -47,75 +47,83 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
        nodes_ngrams_count = defaultdict(int)
        ngrams_data = set()
        # extract ngrams
        resource = corpus.resources()[0]
        documents_count = 0
        # load the available taggers for the source's default languages
        taggers_bots = {lang: load_tagger(lang) for lang in resource['default_languages']}
        # skip documents whose language is not among the resource's default languages
        corpus.skipped_docs = [doc.id for doc in corpus.children('DOCUMENT')
                               if doc.hyperdata["language_iso2"] not in resource["default_languages"]]
        print(set(corpus.languages.keys()).intersection(resource["default_languages"]))
        for documents_count, document in enumerate(corpus.children('DOCUMENT')):
            lang_doc = document.hyperdata['language_iso2']
            if document.id not in corpus.skipped_docs:
                # extract ngrams on each of the considered keys
                ngramextractor = taggers_bots[lang_doc]
                for key in keys:
                    value = document.hyperdata.get(key, None)
                    if not isinstance(value, str):
                        continue
                    # get ngrams
                    for ngram in ngramextractor.extract(value):
                        tokens = tuple(normalize_forms(token[0]) for token in ngram)
                        if do_subngrams:
                            # ex: tokens   = ["very", "cool", "exemple"]
                            #     subterms = [['very', 'cool'],
                            #                 ['very', 'cool', 'exemple'],
                            #                 ['cool', 'exemple']]
                            subterms = subsequences(tokens)
                        else:
                            subterms = [tokens]
                        for seqterm in subterms:
                            ngram = ' '.join(seqterm)
                            if len(ngram) > 1:
                                # doc <=> ngram index
                                nodes_ngrams_count[(document.id, ngram)] += 1
                                # add fields: terms, n
                                ngrams_data.add((ngram[:255], len(seqterm), ))
            # integrate ngrams and nodes-ngrams
            if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
                _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
                nodes_ngrams_count.clear()
                ngrams_data.clear()
            if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
                corpus.status('Ngrams', progress=documents_count+1)
                corpus.save_hyperdata()
                session.commit()
        # integrate ngrams and nodes-ngrams
        _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
        corpus.status('Ngrams', progress=documents_count+1, complete=True)
        corpus.save_hyperdata()
        session.commit()
    except Exception as error:
        corpus.status('Ngrams', error=error)
        corpus.save_hyperdata()
        session.commit()
        raise error
def normalize_forms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
......
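subsequences() is defined elsewhere in the project; the inline example in the extraction loop implies it yields every contiguous run of two or more tokens, the full sequence included. A minimal sketch consistent with that example (an assumption, not gargantext's actual helper):

def subsequences(tokens):
    # all contiguous runs of >= 2 tokens, full sequence included
    # (inferred from the documented example, not the real implementation)
    n = len(tokens)
    return [list(tokens[i:j]) for i in range(n) for j in range(i + 2, n + 1)]

# subsequences(("very", "cool", "exemple"))
# -> [['very', 'cool'], ['very', 'cool', 'exemple'], ['cool', 'exemple']]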