Commit 844de0c2 authored by delanoe

Merge branch 'refactoring-rom' into refactoring-alex

parents 85f8dd96 32495844
@@ -70,8 +70,10 @@ class _BaseClass:
 class Translations(_BaseClass):
-    def __init__(self, source=None):
+    def __init__(self, source=None, just_items=False):
         self.items = defaultdict(int)
+        # TODO lazy init for groups
+        # (not necessary for save)
         self.groups = defaultdict(set)
         if source is None:
             return
@@ -83,15 +85,35 @@ class Translations(_BaseClass):
                 .filter(NodeNgramNgram.node_id == source)
             )
             self.items.update(query)
-            for key, value in self.items.items():
-                self.groups[value].add(key)
+            if not just_items:
+                for key, value in self.items.items():
+                    self.groups[value].add(key)
         elif isinstance(source, Translations):
             self.items.update(source.items)
-            self.groups.update(source.groups)
+            if not just_items:
+                self.groups.update(source.groups)
         elif hasattr(source, '__iter__'):
+            # /!\ update() is not very intuitive here:
+            #     source must be "reversed" (like self.items)
+            # bad example
+            #   In > couples = [(1, 2), (1, 3)]
+            #   In > tlko = Translations(couples)
+            #   Out> Translations {1: 3}
+            #   In > tlko.save()
+            #   DB-- 3 -> 1
+            # good example
+            #   In > reversed_couples = [(2, 1), (3, 1)]
+            #   In > tlok = Translations(reversed_couples)
+            #   Out> Translations {2: 1, 3: 1}
+            #   In > tlok.save()
+            #   DB-- 1 -> 2
+            #   DB-- 1 -> 3
             self.items.update(source)
-            for key, value in self.items.items():
-                self.groups[value].add(key)
+            if not just_items:
+                for key, value in self.items.items():
+                    self.groups[value].add(key)
         else:
             raise TypeError
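
To make the new flag concrete, here is a minimal usage sketch (ids are hypothetical, reusing the "good example" from the comment above; not part of the commit):

from gargantext.util.lists import Translations

reversed_couples = [(2, 1), (3, 1)]
full = Translations(reversed_couples)          # items AND groups are filled
assert dict(full.items) == {2: 1, 3: 1}
assert dict(full.groups) == {1: {2, 3}}
items_only = Translations(reversed_couples, just_items=True)
assert dict(items_only.items) == {2: 1, 3: 1}
assert dict(items_only.groups) == {}           # groups index skipped entirely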
@@ -138,7 +160,7 @@ class Translations(_BaseClass):
         # insert new data
         bulk_insert(
             NodeNgramNgram,
-            ('node_id', 'ngram2_id', 'ngram1_id', 'score'),
+            ('node_id', 'ngram2_id', 'ngram1_id', 'weight'),
             ((node_id, key, value, 1.0) for key, value in self.items.items())
         )
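
For instance, with the "good example" items == {2: 1, 3: 1} saved under a hypothetical list node 42, the generator above yields one weighted row per grouping:

# (node_id=42, ngram2_id=2, ngram1_id=1, weight=1.0)   # DB-- 1 -> 2
# (node_id=42, ngram2_id=3, ngram1_id=1, weight=1.0)   # DB-- 1 -> 3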
@@ -5,6 +5,7 @@ from .ngrams_extraction import extract_ngrams
 from gargantext.util.db import session
 from gargantext.models import Node
+from .group import compute_groups
 
 def parse_extract(corpus):
     # retrieve the corpus from the database by its id
@@ -20,3 +21,7 @@ def parse_extract(corpus):
     print('CORPUS #%d: parsed' % (corpus.id))
     extract_ngrams(corpus)
     print('CORPUS #%d: extracted ngrams' % (corpus.id))
+    # temporary ngram lists workflow
+    group_id = compute_groups(corpus)
+    print('CORPUS #%d: new grouplist = #%i' % (corpus.id, group_id))
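
A hedged sketch of driving this workflow by hand (corpus_id is hypothetical; Query.get() is standard SQLAlchemy; the import path of parse_extract is not visible in this diff):

from gargantext.util.db import session
from gargantext.models import Node

corpus_id = 123                               # hypothetical CORPUS node id
corpus = session.query(Node).get(corpus_id)   # fetch the corpus node
parse_extract(corpus)                         # parse -> extract ngrams -> compute groups

The new group module imported above is added in full below.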
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.util.db import session
from gargantext.util.lists import Translations
# to convert language codes like 'fr' into names like 'french'
from gargantext.util.languages import languages
from nltk.stem.snowball import SnowballStemmer
from re import split as resplit
from collections import defaultdict, Counter

def prepare_stemmers(corpus):
    """
    Returns *several* stemmers (one for each language in the corpus)
    (as a dict of stemmers with key = language_iso2)
    """
    stemmers_by_lg = {
        # always keep a generic stemmer in case the language code is unknown
        '__unknown__' : SnowballStemmer("english")
    }
    for lgiso2 in corpus.hyperdata['languages'].keys():
        lgname = languages[lgiso2].name.lower()
        stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
    return stemmers_by_lg
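
# ex: (illustrative sketch only, assuming an English/French corpus)
#   stemmers = prepare_stemmers(corpus)
#   stemmers['en'].stem('engraving')   # -> 'engrav'
#   stemmers['fr'].stem('poster')      # -> 'post'
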
def compute_groups(corpus, stoplist_id = None):
    """
    1) Use a stemmer/lemmatizer to group forms that share a stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
    3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)
    """
    stop_ngrams_ids = {}
    # we will need the ngrams of the stoplist to filter
    if stoplist_id is not None:
        for id in session.query(NodeNgram.id).filter(NodeNgram.node_id == stoplist_id).all():
            stop_ngrams_ids[id[0]] = True

    # 1) compute stems/lemmas and group ngrams that share one
    stemmers = prepare_stemmers(corpus)

    # todo dict: { lg => {ngrams_todo} }
    todo_ngrams_per_lg = defaultdict(set)

    # res dict: { commonstem: {ngram_1: freq_1, ngram_2: freq_2, ngram_3: freq_3} }
    my_groups = defaultdict(Counter)

    # preloop per doc to sort ngrams by language
    for doc in corpus.children():
        if 'language_iso2' in doc.hyperdata:
            lgid = doc.hyperdata['language_iso2']
        else:
            lgid = "__unknown__"
        # doc.ngrams is an sql query (ugly but useful intermediate step)
        # FIXME: move the counting and stoplist filtering up here
        for ngram_pack in doc.ngrams.all():
            todo_ngrams_per_lg[lgid].add(ngram_pack)

    # --------------------
    # long loop per ngram
    for (lgid, todo_ngs) in todo_ngrams_per_lg.items():
        # fun: word::str => stem::str
        stem_it = stemmers[lgid].stem
        for ng in todo_ngs:
            doc_wei = ng[0]   # weight of this ngram in its document
            ngram = ng[1]     # Ngram obj
            # skip if in STOPLIST
            if ngram.id in stop_ngrams_ids:
                continue
            lexforms = resplit(r'\W+', ngram.terms)
            # STEM IT, and this term's stems will become a new grouping key...
            stemseq = " ".join(stem_it(lexfo) for lexfo in lexforms)
            # ex:
            #   groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
            #   groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
            my_groups[stemseq][ngram.id] += doc_wei
    del todo_ngrams_per_lg

    # now serialize all groups to a list of couples
    ng_couples = []
    addcouple = ng_couples.append
    for grped_ngramids in my_groups.values():
        if len(grped_ngramids) > 1:
            # first find the most frequent term in the counter
            winner_id = grped_ngramids.most_common(1)[0][0]
            for ngram_id in grped_ngramids:
                if ngram_id != winner_id:
                    addcouple((winner_id, ngram_id))
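    # ex: (hypothetical ids, reusing the 'post' example above)
    #   my_groups['post'] == Counter({poste_id: 5, poster_id: 3, postes_id: 2})
    #   => winner_id == poste_id  (most frequent form)
    #   => couples emitted: (poste_id, poster_id), (poste_id, postes_id)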
    del my_groups

    # 2) Create the list node
    the_group = Node()
    the_group.typename = "GROUPLIST"
    the_group.name = "Group (src:%s)" % corpus.name[0:10]
    the_group.parent_id = corpus.id   # could use corpus.parent_id if free list
    the_group.user_id = corpus.user_id
    # and save the node
    session.add(the_group)
    session.commit()
    the_id = the_group.id

    # 3) Save each grouping couple to DB via Translations.save()
    ndngng_list = Translations(
        [(sec, prim) for (prim, sec) in ng_couples],
        just_items=True
    )
    # ...referring to the list node we just got
    ndngng_list.save(the_id)
    return the_id
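
A short sketch (hypothetical ngram ids; not part of the commit) of why the couples are reversed before being handed to Translations, matching the "good example" in the lists diff above:

from gargantext.util.lists import Translations

ng_couples = [(10, 11), (10, 12)]   # (winner_id, grouped_id) couples
tlok = Translations([(sec, prim) for (prim, sec) in ng_couples], just_items=True)
assert dict(tlok.items) == {11: 10, 12: 10}   # each grouped form points to its winner
# tlok.save(the_id) then writes DB rows 10 -> 11 and 10 -> 12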
@@ -2,11 +2,16 @@ from gargantext.util.db import *
 from gargantext.models import *
 from gargantext.constants import *
+from collections import defaultdict
 
 def parse(corpus):
     try:
         documents_count = 0
         corpus.status('parsing', progress=0)
+
+        # will gather info about languages
+        observed_languages = defaultdict(int)
+
         # retrieve resource information
         for resource in corpus.resources():
             # information about the resource
@@ -22,6 +27,7 @@ def parse(corpus):
                     hyperdata = hyperdata,
                 )
                 session.add(document)
+                observed_languages[hyperdata["language_iso2"]] += 1
                 if documents_count % BATCH_PARSING_SIZE == 0:
                     corpus.status('parsing', progress=documents_count)
                     corpus.save_hyperdata()
@@ -29,6 +35,8 @@ def parse(corpus):
                 documents_count += 1
             # update info about the resource
             resource['extracted'] = True
+        # add corpus-level info about languages
+        corpus.hyperdata['languages'] = observed_languages
         # commit all changes
         corpus.status('parsing', progress=documents_count, complete=True)
         corpus.save_hyperdata()
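
For reference, a sketch (hypothetical counts) of the corpus-level summary this produces, which prepare_stemmers() in the new group module reads back via corpus.hyperdata['languages']:

from collections import defaultdict

observed_languages = defaultdict(int)   # as in parse() above
for lg in ('en', 'en', 'fr'):
    observed_languages[lg] += 1
assert dict(observed_languages) == {'en': 2, 'fr': 1}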