Commit 1010b272 authored by c24b's avatar c24b

LANGDETECT set to TRUE

parent 65b8f9b3
......@@ -313,7 +313,7 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# at indexing after extraction)
# TAGGING options -----------------------------------------
#activate lang detection?
DETECT_LANG = False
DETECT_LANG = True
# Defaults INDEXED Fields for ngrams extraction
# put the longest field first to make language detection more efficient
DEFAULT_INDEX_FIELDS = ('abstract','title' )
......
from gargantext.constants import *
from langdetect import detect, DetectorFactory
import time
def timing(f):
    """Decorate ``f`` so that every call prints how long it took.

    The wrapper forwards positional and keyword arguments unchanged and
    returns whatever ``f`` returns. After each successful call the elapsed
    time is printed in milliseconds; if ``f`` raises, nothing is printed
    and the exception propagates.
    """
    # Local import: only this decorator needs functools.
    from functools import wraps

    @wraps(f)  # keep f's __name__ / __doc__ visible on the wrapper
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so system clock adjustments cannot
        # skew (or make negative) the measured duration.
        time1 = time.perf_counter()
        ret = f(*args, **kwargs)
        time2 = time.perf_counter()
        print ('function took %0.3f ms' %((time2-time1)*1000.0))
        return ret
    return wrap
......@@ -28,6 +38,7 @@ class Languages(dict):
languages = Languages()
@timing
def detect_lang(text):
    """Guess the language of ``text`` and return its ``iso2`` value.

    The code reported by langdetect's ``detect`` is used as a key into the
    module-level ``languages`` mapping; the ``iso2`` attribute of the
    matching entry is returned.
    """
    # The seed is reset before every detection -- presumably to keep
    # langdetect's results reproducible; confirm against the langdetect docs.
    DetectorFactory.seed = 0
    detected_code = detect(text)
    language = languages[detected_code]
    return language.iso2
......
import time
def timing(f):
    """Decorate ``f`` so that every call prints its name and duration.

    The wrapper forwards positional and keyword arguments unchanged and
    returns whatever ``f`` returns. After each successful call it prints
    ``f.__name__`` and the elapsed time in milliseconds; if ``f`` raises,
    nothing is printed and the exception propagates.
    """
    # Local import: only this decorator needs functools.
    from functools import wraps

    @wraps(f)  # keep f's __name__ / __doc__ visible on the wrapper
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so system clock adjustments cannot
        # skew (or make negative) the measured duration.
        time1 = time.perf_counter()
        ret = f(*args, **kwargs)
        time2 = time.perf_counter()
        print('%s function took %0.3f ms' % (f.__name__, (time2-time1)*1000.0))
        return ret
    return wrap
......@@ -5,7 +5,8 @@ from gargantext.constants import *
from collections import defaultdict, Counter
from re import sub
from gargantext.util.languages import languages, detect_lang
import time
def add_lang(hyperdata, observed_languages, skipped_languages):
......@@ -54,12 +55,11 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
else:
print("DETECT LANG:", DETECT_LANG)
print("[WARNING] no language_* found in document [parsing.py]")
if DETECT_LANG is False:
skipped_languages.append("__unknown__")
return observed_languages,skipped_languages
print("[WARNING] no language_* found in document [parsing.py]")
#no language have been indexed
#detectlang by joining on the DEFAULT_INDEX_FIELDS
text_fields2 = list(set(DEFAULT_INDEX_FIELDS) & set(hyperdata.keys()))
......@@ -84,6 +84,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
def parse(corpus):
try:
print("PARSING")
print("DETECT_LANG?", DETECT_LANG)
corpus.status('Docs', progress=0)
#1 corpus => 1 or multi resources.path (for crawlers)
resources = corpus.resources()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment