Commit 9e7284d2 authored by Romain Loth

Merge branch 'c24b-stable' into romain-stable-patch2

parents b1fce79d 22a96c99
......@@ -313,7 +313,7 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# at indexing after extraction)
# TAGGING options -----------------------------------------
#activate lang detection?
DETECT_LANG = True
DETECT_LANG = False
# Defaults INDEXED Fields for ngrams extraction
# put longest field first in order to make language detection more efficient
DEFAULT_INDEX_FIELDS = ('abstract','title' )
......
from gargantext.constants import *
from langdetect import detect, DetectorFactory
import time
def timing(f):
    """Decorate *f* so that each call prints how long it took.

    The returned wrapper forwards all positional and keyword arguments to
    *f*, prints the elapsed time in milliseconds to stdout, and returns
    whatever *f* returned. If *f* raises, the exception propagates and
    nothing is printed.
    """
    # Function-scope import: the block only had `time` in scope before.
    import functools

    @functools.wraps(f)  # keep f's __name__/__doc__ visible on the wrapper
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot go negative
        # when the system clock is adjusted mid-call.
        time1 = time.perf_counter()
        ret = f(*args, **kwargs)
        time2 = time.perf_counter()
        print ('function took %0.3f ms' % ((time2 - time1) * 1000.0))
        return ret
    return wrap
class Language:
def __init__(self, iso2=None, iso3=None,full_name=None, name=None):
self.iso2 = iso2
self.iso3 = iso3
self.name = name
self.full_name = full_name
self.implemented = iso2 in LANGUAGES
def __str__(self):
......@@ -38,10 +27,10 @@ class Languages(dict):
languages = Languages()
# Detect the language of `text` with langdetect's `detect` and use the
# detected code as a key into the module-level `languages` mapping.
# NOTE(review): two consecutive return statements follow. As written only the
# first one (returning the entry's `.iso2` attribute) can execute; the second
# (returning the looked-up entry itself) is unreachable. Confirm which result
# the callers expect before removing either line.
@timing
def detect_lang(text):
# The seed is reassigned on every call; presumably this makes langdetect's
# result deterministic for a given text — TODO confirm against langdetect docs.
DetectorFactory.seed = 0
return languages[detect(text)].iso2
return languages[detect(text)]
import pycountry
pycountry_keys = (
......
......@@ -53,6 +53,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
#print(LANGUAGES.keys())
tagger_bots = {lang: load_tagger(lang) for lang in corpus.hyperdata["languages"] \
if lang != "__unknown__"}
tagger_bots["__unknown__"] = load_tagger("en")
print("#TAGGERS LOADED: ", tagger_bots)
supported_taggers_lang = tagger_bots.keys()
print("#SUPPORTED TAGGER LANGS", supported_taggers_lang)
......
......@@ -21,7 +21,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
skipped_languages.append(hyperdata["language_iso2"])
return observed_languages,skipped_languages
observed_languages.append(hyperdata["language_iso2"])
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
elif "language_iso3" in hyperdata.keys():
......@@ -32,53 +32,57 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
skipped_languages.append(lang)
return observed_languages,skipped_languages
observed_languages.append(lang)
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
except KeyError:
print ("LANG not referenced", (hyperdata["language_iso3"]))
skipped_languages.append(hyperdata["language_iso3"])
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
elif "language_fullname" in hyperdata.keys():
elif "language_name" in hyperdata.keys():
try:
#convert
lang = languages[hyperdata["language_fullname"]].iso2
lang = languages[hyperdata["language_name"]].iso2
if lang not in LANGUAGES.keys():
skipped_languages.append(lang)
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
observed_languages.append(lang)
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
except KeyError:
print ("LANG Not referenced", (hyperdata["language_fullname"]))
skipped_languages.append(hyperdata["language_fullname"])
return observed_languages,skipped_languages
print ("LANG Not referenced", (hyperdata["language_name"]))
skipped_languages.append(hyperdata["language_name"])
return hyperdata, observed_languages,skipped_languages
else:
print("[WARNING] no language_* found in document [parsing.py]")
if DETECT_LANG is False:
skipped_languages.append("__unknown__")
return observed_languages,skipped_languages
#skipped_languages.append("__unknown__")
hyperdata["language_iso2"] = "__unknown__"
return hyperdata, observed_languages,skipped_languages
#no language have been indexed
#detectlang by joining on the DEFAULT_INDEX_FIELDS
text_fields2 = list(set(DEFAULT_INDEX_FIELDS) & set(hyperdata.keys()))
print(len(text_fields2))
if len(text_fields2) < 2:
print("[WARNING] missing %s key" %text_fields)
text = " ".join([hyperdata[k] for k in text_fields2])
if len(text) < 10:
hyperdata["error"] = "Error: no TEXT fields to index"
skipped_languages.append("__unknown__")
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
else:
#detect_lang return iso2
lang = detect_lang(text)
if lang not in LANGUAGES.keys():
skipped_languages.append(lang)
return observed_languages,skipped_languages
observed_languages.append(lang)
return observed_languages,skipped_languages
for k in ["iso2", "iso3", "name"]:
hyperdata["language_"+k] = getattr(lang, k)
if lang.iso2 not in LANGUAGES.keys():
#hyperdata["language_iso2"] = "__unknown__"
skipped_languages.append(lang.iso2)
return hyperdata, observed_languages,skipped_languages
observed_languages.append(lang.iso2)
return hyperdata, observed_languages,skipped_languages
def parse(corpus):
......@@ -122,8 +126,8 @@ def parse(corpus):
except Exception as error :
hyperdata["error"] = "Error normalize_chars"
#adding lang into record hyperdata
observed_languages, skipped_languages = add_lang(hyperdata, observed_languages, skipped_languages)
#adding lang into record hyperdata JUST if not declared
hyperdata,observed_languages, skipped_languages = add_lang(hyperdata, observed_languages, skipped_languages)
# save as corpus DB child
# ----------------
......
......@@ -24,6 +24,8 @@
{% block content %}
<div class="container theme-showcase" role="main">
<div class="jumbotron">
<div class="row">
......@@ -35,6 +37,7 @@
<!--<h3> {{number}} corpora </h3>-->
{% endif %}
</div>
<div class="col-md-4">
<p>
{% if donut %}
......@@ -68,6 +71,16 @@
<div class="container">
<!-- Modal -->
<div id="wait" class="modal row col-md-6">
<div class="modal-dialog ">
<h2>Your file has been uploaded ! </h2>
<h2>Gargantext needs some time to eat it.</h2>
<h2>Duration depends on the size of the dish.</h2>
<a class="btn btn-primary btn-lg" href="/projects/{{ project.id }}" title="Click and test by yourself">Continue on Gargantext</a>
</div>
</div>
{% if list_corpora %}
{% for key, corpora in list_corpora.items %}
......@@ -184,7 +197,9 @@
{% endif %}
<!-- Modal -->
<div class="modal fade" id="stack1" tabindex="-1" role="dialog" aria-labelledby="myModalLabel" aria-hidden="true">
<div class="modal-dialog">
<div class="modal-content">
......@@ -318,7 +333,9 @@
// console.log(data)
setTimeout(
function() {
location.reload();
$('#addcorpus').modal('hide');
$("#wait").modal("show");
}, 3000);
},
error: function(result) {
......@@ -563,9 +580,11 @@
// console.log(data)
setTimeout(
function() {
$('#addcorpus').modal('hide');
$("#wait").modal("show");
location.reload();
}, 5000);
//location.reload();
}, 3000);
},
error: function(result) {
console.log("in testISTEX(). Data not found");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment