Commit 5484b242 authored by PkSM3

Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into samuel

parents 1e519b00 26493012
@@ -79,11 +79,14 @@ def project(request, project_id):
     for corpus_id, corpus_name, document_count, processing in corpus_query:
         #print(corpus_id, processing)
         # Not optimized GOTO ISSUE L51
-        resource_type_id = (session.query(Resource.type_id)
-                            .join(Node_Resource, Node_Resource.resource_id == Resource.id)
-                            .join(Node, Node.id == Node_Resource.node_id)
-                            .filter(Node.id == corpus_id)
-                            .first())[0]
+        try:
+            resource_type_id = (session.query(Resource.type_id)
+                                .join(Node_Resource, Node_Resource.resource_id == Resource.id)
+                                .join(Node, Node.id == Node_Resource.node_id)
+                                .filter(Node.id == corpus_id)
+                                .first())[0]
+        except:
+            pass
         if not corpus_id in corpusID_dict:
             if resource_type_id is None:
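Note on the hunk above: with a bare `except: pass`, `resource_type_id` is never bound when the query raises, so the `if resource_type_id is None` test that follows raises `NameError`. A minimal sketch of a safer variant, using only names from the hunk (`session`, `Resource`, `Node_Resource`, `Node`, `corpus_id`); the default binding and the `row is not None` guard are additions of this sketch, not part of the commit:

# Sketch only: same query as the commit, but with a default binding so the
# later `resource_type_id is None` branch still works when the lookup fails.
resource_type_id = None
try:
    row = (session.query(Resource.type_id)
           .join(Node_Resource, Node_Resource.resource_id == Resource.id)
           .join(Node, Node.id == Node_Resource.node_id)
           .filter(Node.id == corpus_id)
           .first())
    if row is not None:
        resource_type_id = row[0]
except Exception as e:
    # A bare `except: pass` hides real errors; at least report them.
    print('resource_type_id lookup failed for corpus %s: %s' % (corpus_id, e))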
@@ -9,11 +9,13 @@ from gargantext_web.db import get_or_create_node
 from parsing.corpustools import *
 import sqlalchemy as sa
 from sqlalchemy.sql import func
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 from sqlalchemy import literal_column
 from sqlalchemy.orm import aliased
 from collections import defaultdict
 #from testlists import *
 from math import log
@@ -94,26 +96,12 @@ def getStemmer(corpus):
         print("No language found")
     def stemIt(ngram):
-        return(set(map(lambda x: stemmer.stem(x), ngram[1].split(' '))))
+        stems = list(map(lambda x: stemmer.stem(x), ngram.split(' ')))
+        stems.sort()
+        return(str(' '.join(stems)))
     return(stemIt)
-def equals(ngram1, ngram2, f=None):
-    '''
-    equals :: (Int,String) -> (Int,String) -> Bool
-    detect if two ngrams are equivalent according to a function :: String -> [String]
-    '''
-    if ngram1[0] == ngram2[0]:
-        # if ngrams have same id then they are the same (and they can not be
-        # grouped)
-        return(False)
-    else:
-        try:
-            return f(ngram1) == f(ngram2)
-        except:
-            return(False)
-            PrintException()
 def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
     '''
     group ngrams according to a function (stemming or lemming)
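Note: the deleted `equals()` compared every pair of ngrams through the stemmer, which is quadratic in the size of the list, while the rewritten `stemIt()` returns a sorted, space-joined stem string that can serve as a dict key, so grouping becomes a single pass. A standalone sketch of that idea; `SnowballStemmer('english')` is an assumption of the sketch, whereas the commit obtains a language-appropriate stemmer from `getStemmer(corpus)`:

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem_signature(terms):
    # Sorted stems make word order irrelevant, mirroring the new stemIt().
    stems = sorted(stemmer.stem(w) for w in terms.split(' '))
    return ' '.join(stems)

buckets = {}
for ngram_id, terms in [(1, 'neural networks'), (2, 'neural network')]:
    buckets.setdefault(stem_signature(terms), []).append(ngram_id)

print(buckets)  # {'network neural': [1, 2]}: both forms share one signature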
@@ -121,38 +109,72 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
     dbg = DebugTime('Corpus #%d - group' % corpus.id)
     dbg.show('Group')
-    spec,cvalue = getNgrams(corpus, limit_inf=limit_inf, limit_sup=limit_sup)
+    #spec,cvalue = getNgrams(corpus, limit_inf=limit_inf, limit_sup=limit_sup)
+    #list_to_check=cvalue.union(spec)
     if how == 'Stem':
         stemIt = getStemmer(corpus)
-    group_to_insert = list()
+    group_to_insert = set()
     node_group = get_or_create_node(nodetype='Group', corpus=corpus)
     miam_to_insert = set()
     miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    list_to_check = cvalue.union(spec)
-    for n in spec:
-        group = filter(lambda x: equals(n, x, f=stemIt), list_to_check)
-        miam_to_insert.add((miam_node.id, n[0], 1))
-        #print([n for n in group])
-        for g in group:
-            if (miam_node.id, g[0], 1) not in miam_to_insert:
-                group_to_insert.append((node_group.id, n[0], g[0], 1))
-                print(n[1], "=", g[1])
-    # TODO see here if coherent add in miam or group...
-    # Deleting previous groups
+    frequency = sa.func.count(NodeNgram.weight)
+    ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
+              .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+              .join(Node, Node.id == NodeNgram.node_id)
+              .filter(Node.parent_id == corpus.id, Node.type_id == cache.NodeType['Document'].id)
+              .group_by(Ngram.id)
+              .order_by(desc(frequency))
+              #.all()
+              .limit(limit_sup)
+             )
+    #group = defaultdict(lambda: defaultdict())
+    ids_dict = dict()
+    mainform_dict = dict()
+    count_dict = dict()
+    for n in ngrams:
+        stem = str(stemIt(n[1]))
+        if stem is not None:
+            ids_dict[stem] = ids_dict.get(stem, []) + [n[0]]
+            count = count_dict.get(stem, 0)
+            if n[2] > count:
+                mainform_dict[stem] = n[0]
+                count_dict[stem] = n[2]
+    for key in mainform_dict.keys():
+        miam_to_insert.add((miam_node.id, mainform_dict[key], 1))
+        try:
+            ids = ids_dict[key]
+            if ids is not None and len(ids) > 1:
+                for ngram_id in ids:
+                    if ngram_id != mainform_dict[key]:
+                        group_to_insert.add((node_group.id, mainform_dict[key], ngram_id, 1))
+        except Exception as e:
+            print(e)
+            print(ids_dict.get(key))
+        #print(ids_dict[key])
+    # Deleting previous groups
     session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_group.id).delete()
+    # Deleting previous ngrams miam list
+    session.query(NodeNgram).filter(NodeNgram.node_id == miam_node.id).delete()
     session.commit()
     bulk_insert(NodeNgramNgram
                , ('node_id', 'ngramx_id', 'ngramy_id', 'score')
                , [data for data in group_to_insert])
-    for n in group_to_insert:
-        #print(n)
-        miam_to_insert.add((miam_node.id, n[1], 1))
-    # Deleting previous ngrams miam list
-    session.query(NodeNgram).filter(NodeNgram.node_id == miam_node.id).delete()
     bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), [data for data in list(miam_to_insert)])
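Note: the rewritten loop keeps, per stem, the ngram with the highest document count as the group's main form (`mainform_dict`) and attaches every other ngram sharing that stem to it. A self-contained sketch of that selection; the rows and the toy `stem_signature()` are illustrative stand-ins for the SQLAlchemy query and `stemIt()`:

# Illustrative (id, terms, frequency) rows standing in for the query results.
rows = [(1, 'neural networks', 12), (2, 'neural network', 30), (3, 'graph', 5)]

def stem_signature(terms):
    # Toy stemmer: strip a trailing 's'; the commit uses stemIt() instead.
    return ' '.join(sorted(w.rstrip('s') for w in terms.split(' ')))

ids_dict, mainform_dict, count_dict = {}, {}, {}
for ngram_id, terms, freq in rows:
    stem = stem_signature(terms)
    ids_dict.setdefault(stem, []).append(ngram_id)
    if freq > count_dict.get(stem, 0):
        # The highest-frequency ngram becomes the main form of its stem group.
        mainform_dict[stem] = ngram_id
        count_dict[stem] = freq

group_links = [(mainform_dict[s], other)
               for s, ids in ids_dict.items() if len(ids) > 1
               for other in ids if other != mainform_dict[s]]
print(group_links)  # [(2, 1)]: 'neural network' (freq 30) absorbs 'neural networks'

Deleting the previous Group and MiamList rows before the two `bulk_insert` calls, inside the same session, is what keeps the recomputation idempotent.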
@@ -131,7 +131,6 @@ def compute_tfidf_global(corpus):
     db, cursor = get_cursor()
     cursor.execute('''
         CREATE TEMPORARY TABLE tmp__tf (
             ngram_id  INT NOT NULL,
             frequency DOUBLE PRECISION NOT NULL
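Note: `compute_tfidf_global()` stages per-ngram frequencies in a temporary table so the aggregation runs server-side before the tf-idf update. A sketch of that staging pattern, assuming `get_cursor()` returns a psycopg2-style connection and cursor as the surrounding code suggests; the inserted values are illustrative:

# Staging-pattern sketch; tmp__tf follows the hunk above and lasts only
# for this database session.
db, cursor = get_cursor()
cursor.execute('''
    CREATE TEMPORARY TABLE tmp__tf (
        ngram_id  INT NOT NULL,
        frequency DOUBLE PRECISION NOT NULL
    )
''')
cursor.executemany(
    'INSERT INTO tmp__tf (ngram_id, frequency) VALUES (%s, %s)',
    [(1, 0.25), (2, 0.75)],   # illustrative frequencies
)
cursor.execute('SELECT ngram_id, frequency FROM tmp__tf')
print(cursor.fetchall())
db.commit()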
@@ -10,6 +10,8 @@ class NgramsExtractor:
         This method can be overridden.
     """
     def __init__(self, rule="{<JJ.*>*<NN.*|>+<JJ.*>*}"):
+        # TODO add this regex
+        #'^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?)+?)*?$'
         self.start()
         self._label = "NP"
         self._rule = self._label + ": " + rule
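Note: the default `rule` is an NLTK chunk grammar over POS tags (optional adjectives around one or more nouns), and `__init__` prefixes it with the `NP` label; the commented regex is a TODO for a richer tag pattern. A sketch of how such a rule extracts noun phrases with NLTK's `RegexpParser`; the tagged sentence is made up for illustration:

import nltk

# Label + rule, exactly as NgramsExtractor.__init__ assembles self._rule.
grammar = 'NP: {<JJ.*>*<NN.*|>+<JJ.*>*}'
parser = nltk.RegexpParser(grammar)

tagged = [('deep', 'JJ'), ('neural', 'JJ'), ('networks', 'NNS'),
          ('learn', 'VBP'), ('representations', 'NNS')]
tree = parser.parse(tagged)
for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
    print(' '.join(word for word, tag in subtree.leaves()))
# -> 'deep neural networks' and 'representations'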