Commit 401ab95a authored by c24b

LANGDETECT set to TRUE

parent 47d38d27
...@@ -313,7 +313,7 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional ...@@ -313,7 +313,7 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# at indexing after extraction) # at indexing after extraction)
# TAGGING options ----------------------------------------- # TAGGING options -----------------------------------------
#activate lang detection? #activate lang detection?
DETECT_LANG = False DETECT_LANG = True
# Defaults INDEXED Fields for ngrams extraction # Defaults INDEXED Fields for ngrams extraction
# put longest field first in order to make detection language more efficient # put longest field first in order to make detection language more efficient
DEFAULT_INDEX_FIELDS = ('abstract','title' ) DEFAULT_INDEX_FIELDS = ('abstract','title' )
......
from gargantext.constants import * from gargantext.constants import *
from langdetect import detect, DetectorFactory from langdetect import detect, DetectorFactory
import time
def timing(f):
    """Decorate *f* so that every call prints its duration in milliseconds.

    The returned wrapper forwards all positional and keyword arguments to
    *f* and returns *f*'s result unchanged. The timing line is printed only
    when *f* returns normally; if *f* raises, the exception propagates and
    nothing is printed.

    Args:
        f: the callable to time.

    Returns:
        A wrapper callable with the same call signature as *f*.
    """
    # Function-scope import so the decorator does not rely on a file-level
    # `import functools` being present.
    import functools

    @functools.wraps(f)  # keep f's __name__ / __doc__ visible on the wrapper
    def wrap(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot go negative if the system clock is adjusted.
        start = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start) * 1000.0
        print('function took %0.3f ms' % elapsed_ms)
        return ret
    return wrap
...@@ -28,6 +38,7 @@ class Languages(dict): ...@@ -28,6 +38,7 @@ class Languages(dict):
languages = Languages() languages = Languages()
@timing
def detect_lang(text): def detect_lang(text):
DetectorFactory.seed = 0 DetectorFactory.seed = 0
return languages[detect(text)].iso2 return languages[detect(text)].iso2
......
import time
def timing(f):
    """Decorate *f* so that every call prints its name and duration in ms.

    The returned wrapper forwards all positional and keyword arguments to
    *f* and returns *f*'s result unchanged. The timing line is printed only
    when *f* returns normally; if *f* raises, the exception propagates and
    nothing is printed.

    Args:
        f: the callable to time.

    Returns:
        A wrapper callable with the same call signature as *f*.
    """
    # Function-scope import so the decorator does not rely on a file-level
    # `import functools` being present.
    import functools

    # Not every callable has __name__ (functools.partial objects, instances
    # defining __call__); fall back to repr() instead of raising at call time.
    label = getattr(f, '__name__', repr(f))

    @functools.wraps(f)  # keep f's __name__ / __doc__ visible on the wrapper
    def wrap(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot go negative if the system clock is adjusted.
        start = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start) * 1000.0
        print('%s function took %0.3f ms' % (label, elapsed_ms))
        return ret
    return wrap
...@@ -5,7 +5,8 @@ from gargantext.constants import * ...@@ -5,7 +5,8 @@ from gargantext.constants import *
from collections import defaultdict, Counter from collections import defaultdict, Counter
from re import sub from re import sub
from gargantext.util.languages import languages, detect_lang from gargantext.util.languages import languages, detect_lang
import time
def add_lang(hyperdata, observed_languages, skipped_languages): def add_lang(hyperdata, observed_languages, skipped_languages):
...@@ -54,12 +55,11 @@ def add_lang(hyperdata, observed_languages, skipped_languages): ...@@ -54,12 +55,11 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
else: else:
print("DETECT LANG:", DETECT_LANG) print("[WARNING] no language_* found in document [parsing.py]")
if DETECT_LANG is False: if DETECT_LANG is False:
skipped_languages.append("__unknown__") skipped_languages.append("__unknown__")
return observed_languages,skipped_languages return observed_languages,skipped_languages
print("[WARNING] no language_* found in document [parsing.py]")
#no language have been indexed #no language have been indexed
#detectlang by joining on the DEFAULT_INDEX_FIELDS #detectlang by joining on the DEFAULT_INDEX_FIELDS
text_fields2 = list(set(DEFAULT_INDEX_FIELDS) & set(hyperdata.keys())) text_fields2 = list(set(DEFAULT_INDEX_FIELDS) & set(hyperdata.keys()))
...@@ -84,6 +84,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages): ...@@ -84,6 +84,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
def parse(corpus): def parse(corpus):
try: try:
print("PARSING") print("PARSING")
print("DETECT_LANG?", DETECT_LANG)
corpus.status('Docs', progress=0) corpus.status('Docs', progress=0)
#1 corpus => 1 or multi resources.path (for crawlers) #1 corpus => 1 or multi resources.path (for crawlers)
resources = corpus.resources() resources = corpus.resources()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment