LANGDETECT set to TRUE

401ab95a · c24b · 47d38d27 · 401ab95a · 401ab95a · 401ab95a
Commit 401ab95a authored Aug 26, 2016 by c24b
Showing with 26 additions and 4 deletions

constants.py gargantext/constants.py +1 -1

languages.py gargantext/util/languages.py +11 -0

timeit_damnit.py gargantext/util/timeit_damnit.py +10 -0

parsing.py gargantext/util/toolchain/parsing.py +4 -3

No files found.
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -313,7 +313,7 @@ DEFAULT_INDEX_SUBGRAMS         = False        # False <=> traditional
                                             #    at indexing after extraction)
 # TAGGING options   -----------------------------------------
 #activate lang detection?
-DETECT_LANG = False
+DETECT_LANG = True
 # Defaults INDEXED Fields for ngrams extraction
 # put longest field first in order to make detection language more efficient
 DEFAULT_INDEX_FIELDS            = ('abstract','title' )

--- a/gargantext/util/languages.py
+++ b/gargantext/util/languages.py
 from gargantext.constants import *
 from langdetect import detect, DetectorFactory
+import time
+
+def timing(f):
+    """Decorate ``f`` so that every call prints how long it took, in ms.
+
+    The wrapper forwards positional and keyword arguments unchanged and
+    returns whatever ``f`` returns.  If ``f`` raises, nothing is printed
+    and the exception propagates to the caller.
+    """
+    # Local import: this module only imports `time` at top level.
+    import functools
+
+    @functools.wraps(f)  # keep f's __name__ / __doc__ on the wrapper
+    def wrap(*args, **kwargs):
+        # perf_counter is monotonic, so the measured interval cannot be
+        # skewed by a system clock adjustment during the call.
+        start = time.perf_counter()
+        ret = f(*args, **kwargs)
+        elapsed_ms = (time.perf_counter() - start) * 1000.0
+        # Name the function so output stays readable when several
+        # functions are decorated.
+        print('%s function took %0.3f ms' % (f.__name__, elapsed_ms))
+        return ret
+    return wrap



@@ -28,6 +38,7 @@ class Languages(dict):

 languages = Languages()

+# Detect the language of `text` with langdetect's `detect` and return the
+# `iso2` attribute of the matching entry in `languages`.
+# NOTE(review): wrapped in `timing`, so every call also prints its duration.
+@timing
 def detect_lang(text):
+    # The seed is reset on every call; presumably this makes langdetect's
+    # answer reproducible for the same text -- confirm against langdetect docs.
    DetectorFactory.seed = 0
    return languages[detect(text)].iso2

--- a/gargantext/util/timeit_damnit.py
+++ b/gargantext/util/timeit_damnit.py
+import time
+
+def timing(f):
+    """Decorate ``f`` so that every call prints its name and duration in ms.
+
+    The wrapper forwards positional and keyword arguments unchanged and
+    returns whatever ``f`` returns.  If ``f`` raises, nothing is printed
+    and the exception propagates to the caller.
+    """
+    # Local import: this module only imports `time` at top level.
+    import functools
+
+    @functools.wraps(f)  # keep f's __name__ / __doc__ on the wrapper
+    def wrap(*args, **kwargs):
+        # perf_counter is monotonic, so the measured interval cannot be
+        # skewed by a system clock adjustment during the call.
+        start = time.perf_counter()
+        ret = f(*args, **kwargs)
+        elapsed_ms = (time.perf_counter() - start) * 1000.0
+        print('%s function took %0.3f ms' % (f.__name__, elapsed_ms))
+        return ret
+    return wrap
--- a/gargantext/util/toolchain/parsing.py
+++ b/gargantext/util/toolchain/parsing.py
@@ -5,7 +5,8 @@ from gargantext.constants import *
 from collections import defaultdict, Counter
 from re          import sub
 from gargantext.util.languages import languages, detect_lang
-import time
+
+


 def add_lang(hyperdata, observed_languages, skipped_languages):
@@ -54,12 +55,11 @@ def add_lang(hyperdata, observed_languages, skipped_languages):


    else:
-        print("DETECT LANG:", DETECT_LANG)
+        print("[WARNING] no language_* found in document [parsing.py]")
        if DETECT_LANG is False:
            skipped_languages.append("__unknown__")
            return observed_languages,skipped_languages

-        print("[WARNING] no language_* found in document [parsing.py]")
        #no language have been indexed
        #detectlang by joining on the DEFAULT_INDEX_FIELDS
        text_fields2 = list(set(DEFAULT_INDEX_FIELDS) & set(hyperdata.keys()))
@@ -84,6 +84,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
 def parse(corpus):
    try:
        print("PARSING")
+        print("DETECT_LANG?", DETECT_LANG)
        corpus.status('Docs', progress=0)
        #1 corpus => 1 or multi resources.path (for crawlers)
        resources = corpus.resources()