parsing+extraction: removing old debug messages

e8d5e001 · Romain Loth · 5ce424f9
Commit e8d5e001 authored Sep 05, 2016 by Romain Loth
5 changed files
--- a/gargantext/util/parsers/EUROPRESSE.py
+++ b/gargantext/util/parsers/EUROPRESSE.py
@@ -115,8 +115,6 @@ class EuropresseParser(Parser):
        # parse all the articles, one by one
        for html_article in html_articles:
            try:
-                print("==============================new article")
-
                # s'il n'y a pas du tout de header on doit skip
                all_header = html_article.xpath(entire_header_xpath)
                all_header_text = " ".join(scrap_text(all_header))

--- a/gargantext/util/parsers/ISTEX.py
+++ b/gargantext/util/parsers/ISTEX.py
@@ -27,7 +27,6 @@ class ISTexParser(Parser):
        }

        suma = 0
-        print(len(json_docs))
        for json_doc in json_docs:

            hyperdata = {}

--- a/gargantext/util/taggers/_Tagger.py
+++ b/gargantext/util/taggers/_Tagger.py
@@ -32,7 +32,7 @@ class Tagger:
        self.text = self.clean_text(text)
        grammar = nltk.RegexpParser(label + ': ' + rule)
        tagged_tokens = list(self.tag_text(self.text))
-        print("the tagged_tokens", tagged_tokens)
+        # print("the tagged_tokens", tagged_tokens)
        if len(tagged_tokens):
            grammar_parsed = grammar.parse(tagged_tokens)
            for subtree in grammar_parsed.subtrees():

--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
@@ -11,7 +11,8 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):

    £TODO: load whole word dictionary in ram and check existence before inserting to db => sequential insert => probably faster!
    """
-    print('INTEGRATE', len(ngrams_data), len(nodes_ngrams_count))
+    # print('INTEGRATE', len(ngrams_data), len(nodes_ngrams_count))
+    print('INTEGRATE')
    # integrate ngrams (aka new words)
    ngrams_ids = bulk_insert_ifnotexists(
        model = Ngram,                # todo type should :str ~~> :str|:re) !!!
@@ -118,7 +119,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND

            # integrate ngrams and nodes-ngrams
            if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
-                print(len(nodes_ngrams_count),">=", BATCH_NGRAMSEXTRACTION_SIZE)
+                # print(len(nodes_ngrams_count),">=", BATCH_NGRAMSEXTRACTION_SIZE)
                _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
                nodes_ngrams_count.clear()
                ngrams_data.clear()

--- a/gargantext/util/toolchain/parsing.py
+++ b/gargantext/util/toolchain/parsing.py
@@ -68,7 +68,8 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
            lang_result['skipped'].append(hyperdata["language_name"])

    else:
-        print("[WARNING] no language_* found in document [parsing.py]")
+        print("WARNING no language_* found in document [parsing.py] => "
+               + ("(detecting)" if DETECT_LANG else "(using default)"))

        if DETECT_LANG:
            #no language have been indexed
@@ -93,7 +94,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
 def parse(corpus):
    try:
        print("PARSING")
-        print("DETECT_LANG?", DETECT_LANG)
+        # print("DETECT_LANG?", DETECT_LANG)
        corpus.status('Docs', progress=0)
        #1 corpus => 1 or multi resources.path (for crawlers)
        resources = corpus.resources()
@@ -107,7 +108,9 @@ def parse(corpus):
            #corpus.status(error)
            raise ValueError("Resource '%s' has no Parser" %resource["name"])
        parserbot = load_parser(source)
-        print(parserbot)
+
+        # print(parserbot)
+
        #observed languages in default languages
        observed_languages = []
        #skipped_languages
@@ -218,10 +221,10 @@ def parse(corpus):
        #les jolis iso2
        observed_langs = dict(Counter(observed_languages))

-        print("#LANGAGES OK")
-        print(observed_langs)
-        print("#LANGUAGES UNKNOWN")
-        print(skipped_langs)
+        # print("#LANGAGES OK")
+        # print(observed_langs)
+        # print("#LANGUAGES UNKNOWN")
+        # print(skipped_langs)

        top_langs = sorted(observed_langs.items(), key = lambda x: x[1], reverse=True)
        if len(top_langs) > 0: