humanities / gargantext · Commits

Commit bd714567
Authored Aug 26, 2016 by c24b
Parent: e34b48b5

Revert taggers modifications

Showing 6 changed files with 35 additions and 36 deletions (+35 / -36).
gargantext/util/taggers/NltkTagger.py              +2   -14
gargantext/util/taggers/TreeTagger.py              +0   -1
gargantext/util/taggers/TurboTagger.py             +1   -2
gargantext/util/taggers/_Tagger.py                 +2   -1
gargantext/util/toolchain/ngrams_extraction.py     +4   -4
gargantext/util/toolchain/parsing.py               +26  -14
gargantext/util/taggers/NltkTagger.py

@@ -8,22 +8,10 @@ class NltkTagger(Tagger):
     #import nltk

     def __init__(self, *args, **kwargs):
         self.tagr = PerceptronTagger()
-        #super(self.__class__, self).__init__(*args, **kwargs)
+        super(self.__class__, self).__init__(*args, **kwargs)

-    #def __start__(self):
+    #~ def __start__(self):
     #~ self.tagr = PerceptronTagger()

     def tag_tokens(self, tokens, single=True):
         return self.tagr.tag(tokens)

-    # def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
-    #     self.text = self.clean_text(text)
-    #     grammar = nltk.RegexpParser(label + ': ' + rule)
-    #     tagged_tokens = list(self.tag_text(self.text))
-    #     if len(tagged_tokens):
-    #         grammar_parsed = grammar.parse(tagged_tokens)
-    #         for subtree in grammar_parsed.subtrees():
-    #             if subtree.label() == label:
-    #                 if len(subtree) < max_n_words:
-    #                     yield subtree.leaves()
-    #                     # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
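For context on what tag_tokens returns here: NLTK's PerceptronTagger tags an already-tokenized list of words with part-of-speech labels. A minimal standalone sketch, assuming NLTK and its averaged_perceptron_tagger model data are installed (not part of this commit):

    # Standalone sketch of the PerceptronTagger call behind NltkTagger.tag_tokens.
    # Assumes: pip install nltk, then nltk.download('averaged_perceptron_tagger').
    from nltk.tag.perceptron import PerceptronTagger

    tagr = PerceptronTagger()            # loads the pretrained model by default
    tokens = ['wild', 'pollinators', 'visit', 'apple', 'flowers']
    print(tagr.tag(tokens))
    # e.g. [('wild', 'JJ'), ('pollinators', 'NNS'), ('visit', 'VBP'), ...]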
gargantext/util/taggers/TreeTagger.py

@@ -74,7 +74,6 @@ class TreeTagger(Tagger):
         self._input, self._output = self._popen.stdin, self._popen.stdout
         # self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
         # self.buffer = OutputBuffer()
-        # self.extract(self.text)

     def stop(self):
         # terminates the 'treetagger' process
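The lines kept here wire the tagger to a long-running child process over stdin/stdout. A generic sketch of that pattern, using tr as a stand-in since the real 'treetagger' binary is not assumed to be installed:

    # Generic sketch of what TreeTagger.start() relies on: keep a child process
    # open and talk to it over stdin/stdout. 'tr' stands in for the treetagger binary.
    import subprocess

    popen = subprocess.Popen(['tr', 'a-z', 'A-Z'],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE)
    _input, _output = popen.stdin, popen.stdout

    _input.write(b'wild pollinators\n')
    _input.close()              # like stop(): let the child flush and exit
    print(_output.read())       # b'WILD POLLINATORS\n'
    popen.wait()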
gargantext/util/taggers/TurboTagger.py

@@ -6,13 +6,12 @@ class TurboTagger:
     def start(self):
         self._nlpclient = NLPClient()
-        #self.extract(self.text)

     def stop(self):
         if hasattr(self, '_nlpclient'):
             del self._nlpclient

-    def extract(self, text):
+    def tag_text(self, text):
         if not hasattr(self, '_nlpclient'):
             self._nlpclient = NLPClient()
         try:
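The renamed tag_text keeps the hasattr guard, so the NLPClient is still created lazily on first use even if start() was never called. A self-contained sketch of that lazy start/stop pattern, with a placeholder client since NLPClient's real API is not shown in this commit:

    # Sketch of the lazy start/stop pattern kept in TurboTagger.
    # DummyClient is a placeholder, not the real NLPClient.
    class DummyClient:
        def tag(self, text):
            return [(w, 'NN') for w in text.split()]

    class LazyTagger:
        def start(self):
            self._client = DummyClient()

        def stop(self):
            if hasattr(self, '_client'):
                del self._client

        def tag_text(self, text):
            if not hasattr(self, '_client'):   # lazy init, mirrors the hasattr guard
                self._client = DummyClient()
            return self._client.tag(text)

    print(LazyTagger().tag_text('wild pollinators'))  # works without calling start()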
gargantext/util/taggers/_Tagger.py

@@ -19,7 +19,8 @@ class Tagger:
             | [][.,;"'?!():-_`] # these are separate tokens
             ''', re.UNICODE | re.MULTILINE | re.DOTALL)
         self.buffer = []
-        self.start()
+        #self.start()

     def clean_text(self, text):
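The context lines belong to a verbose tokenizing regex compiled with re.UNICODE | re.MULTILINE | re.DOTALL, in which the listed punctuation characters become separate tokens. An illustration of that kind of pattern; the word rule below is an assumption, only the punctuation class is taken from the diff context:

    # Illustration of a verbose tokenizing regex with the same flags as _Tagger's.
    import re

    re_token = re.compile(r'''(?x)      # verbose mode
          \w+(?:-\w+)*                  # words, possibly hyphenated (assumed rule)
        | [][.,;"'?!():-_`]             # these are separate tokens
        ''', re.UNICODE | re.MULTILINE | re.DOTALL)

    print(re_token.findall('Wild pollinators (e.g. bees) visit apple-flowers.'))
    # ['Wild', 'pollinators', '(', 'e', '.', 'g', '.', 'bees', ')', 'visit', 'apple-flowers', '.']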
gargantext/util/toolchain/ngrams_extraction.py

@@ -9,7 +9,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
     """
     @param ngrams_data a set like {('single word', 2), ('apple', 1),...}
     """
-    #print('INTEGRATE')
+    print('INTEGRATE')
     # integrate ngrams
     ngrams_ids = bulk_insert_ifnotexists(
         model = Ngram,

@@ -59,7 +59,6 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
     #sort docs by lang?
     # for lang, tagger in tagger_bots.items():
     for documents_count, document in enumerate(corpus.children('DOCUMENT')):
         if document.id not in corpus.hyperdata["skipped_docs"]:
             language_iso2 = document.hyperdata.get('language_iso2')
             if language_iso2 not in supported_taggers_lang:

@@ -73,7 +72,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
             else:
                 tagger = tagger_bots[language_iso2]
+                print(tagger)
                 #print(language_iso2)
                 #>>> romain-stable-patch
                 #to do verify if document has no KEYS to index

@@ -109,10 +108,11 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
             # integrate ngrams and nodes-ngrams
             if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
+                print(len(nodes_ngrams_count), ">=", BATCH_NGRAMSEXTRACTION_SIZE)
                 _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
                 nodes_ngrams_count.clear()
                 ngrams_data.clear()
-            if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
+            if documents_count % BATCH_PARSING_SIZE == 0:
                 corpus.status('Ngrams', progress=documents_count+1)
                 corpus.save_hyperdata()
                 session.add(corpus)
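The last hunk shows the batch-flush pattern: ngram counts accumulate per document and are written out via _integrate_associations once the buffer reaches BATCH_NGRAMSEXTRACTION_SIZE, with a periodic progress update. A self-contained sketch of that accumulation/flush logic; integrate() and the sizes are stand-ins, not the project's real helpers:

    # Sketch of the batch-flush pattern in extract_ngrams.
    from collections import Counter

    BATCH_SIZE = 4

    def integrate(counts):
        print('flushing', dict(counts))   # stand-in for _integrate_associations(...)

    def extract(documents):
        counts = Counter()
        for documents_count, doc in enumerate(documents):
            for word in doc.split():
                counts[word] += 1
            if len(counts) >= BATCH_SIZE:      # flush a full batch
                integrate(counts)
                counts.clear()
            if documents_count % 2 == 0:       # periodic progress report
                print('progress:', documents_count + 1)
        if counts:                             # flush the final partial batch
            integrate(counts)

    extract(['wild pollinators', 'apple flowers', 'wild apple'])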
gargantext/util/toolchain/parsing.py

@@ -99,15 +99,14 @@ def parse(corpus):
     #skipped docs to remember for later processing
     skipped_docs = []
-    documents_count = 0
     #BY RESOURCE
     for i, resource in enumerate(resources):
         if resource["extracted"] is True:
             continue
         else:
             # BY documents
-            d = 0
-            for hyperdata in parserbot(resource["path"]):
+            for documents_count, hyperdata in enumerate(parserbot(resource["path"])):
                 # indexed text fields defined in CONSTANTS
                 for k in DEFAULT_INDEX_FIELDS:
                     if k in hyperdata.keys():

@@ -126,32 +125,45 @@ def parse(corpus):
                     name = hyperdata.get('title', '')[:255],
                     hyperdata = hyperdata,
                 )
+                #corpus.save_hyperdata()
-                session.add(document)
+                # session.add(document)
-                session.commit()
+                # session.commit()
                 if "error" in hyperdata.keys():
                     #document.status("error")
-                    document.status('Parsing', error=document.hyperdata["error"])
+                    #document.status('Parsing', error= document.hyperdata["error"])
-                    document.save_hyperdata()
+                    #document.save_hyperdata()
+                    #session.add(document)
+                    #session.commit()
                     #adding skipped_docs for later processsing if error in parsing
                     skipped_docs.append(document.id)
+                #BATCH_PARSING_SIZE
                 if documents_count % BATCH_PARSING_SIZE == 0:
                     corpus.status('Docs', progress=documents_count)
                     corpus.save_hyperdata()
-                    #session.add(corpus)
+                    session.add(corpus)
-                    #session.commit()
+                    session.commit()
-                documents_count += 1
             # update info about the resource
             resource['extracted'] = True
             #print( "resource n°",i, ":", d, "docs inside this file")
     #finally store documents for this corpus
+    session.add(corpus)
+    corpus.status('Parsing', progress=documents_count+1, complete=True)
+    session.commit()
+    #corpus.status('Parsing', complete =True)
+    corpus.save_hyperdata()
+    #session.add(corpus)
+    #session.commit()
+    #adding parsing error to document level
+    for node_id in skipped_docs:
+        node = session.query(Node).filter(Node.id == node_id).first()
+        node.status("Parsing", "Error in parsing")
+        node.save_hyperdata()
+    #session.flush()
+    #skipped_nodes = session.query(Node).filter(Node.id.in_(skipped_docs)).all()
+    #mods = [node.status('Parsing', "Error in parsing:skipped") for node in skipped_nodes]
     #STORING AGREGATIONS INFO (STATS)
     #skipped_docs
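The net effect of this hunk is to move persistence from one add()/commit() per document back to one per BATCH_PARSING_SIZE documents, with a final commit (plus an error pass over skipped_docs) at the end. A schematic of that batched-commit idea, not the exact code; FakeSession is a stand-in for the SQLAlchemy session:

    # Sketch of committing every BATCH_PARSING_SIZE documents instead of every document.
    BATCH_PARSING_SIZE = 100

    class FakeSession:
        def add(self, obj): pass               # cheap: just stages the object
        def commit(self): print('commit')      # expensive: one round-trip per batch

    def parse(documents, session):
        for documents_count, document in enumerate(documents):
            session.add(document)
            if documents_count % BATCH_PARSING_SIZE == 0:
                session.commit()
        session.commit()                        # flush the last partial batch

    parse(range(250), FakeSession())   # 3 periodic commits + 1 final commit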