Commit f0cc9050 authored by delanoe

[FEAT] adding option to tokenize monograms without nlp, + stop tools

parent dcfe453b
@@ -42,7 +42,7 @@ def apply_workflow(corpus_id):
     parse_resources(corpus)
     update_processing(corpus, 2)
-    extract_ngrams(corpus, ['title', 'abstract'])
+    extract_ngrams(corpus, ['title', 'abstract'], nlp=True)
     update_processing(corpus, 3)
     ngram_workflow(corpus)
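
The workflow now passes the new `nlp` flag explicitly, keeping the previous NLP-based extraction as the default. A minimal sketch of how a caller could opt out of POS tagging; the wrapper name is hypothetical and not part of this commit:

```python
# Hypothetical helper, not part of this commit: same pipeline, but with the
# cheaper tokenizer for corpora where no POS tagger is available.
def apply_workflow_without_nlp(corpus_id):
    corpus = session.query(Node).filter(Node.id == corpus_id).first()
    parse_resources(corpus)
    update_processing(corpus, 2)
    extract_ngrams(corpus, ['title', 'abstract'], nlp=False)  # monograms only
    update_processing(corpus, 3)
    ngram_workflow(corpus)
```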
@@ -699,9 +699,17 @@ def sankey_csv(request, corpus_id):
     corpus = session.query(Node).filter(Node.id==corpus_id).first()
     data = [
         ["source", "target", "value"]
-        , ["Elvis_1", "Elvis_2", 1]
-        , ["Elvis_2", "Elvis_3", 2]
-        , ["Barry", "Elvis_3", 2]
+        , ["Comment_1", "Theme_1", 1]
+        , ["Comment_2", "Theme_2", 2]
+        , ["Comment_3", "Theme_2", 2]
+        , ["Comment_7", "Theme_1", 2]
+        , ["Comment_8", "Theme_3", 2]
+        , ["Theme_1", "Reco_par_1", 2]
+        , ["Theme_2", "Reco_par_2", 2]
+        , ["Theme_2", "Reco_par_5", 2]
+        , ["Theme_3", "Reco_par_5", 1]
         ]
     return(CsvHttpResponse(data))
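
The placeholder Elvis/Barry rows are replaced with sample Comment → Theme → Reco_par flows. For illustration only (a standalone, made-up aggregation, not code from this commit), the rows read as weighted edges of a sankey diagram:

```python
# Illustration only: sum the flow passing through each node of the sample rows.
from collections import defaultdict

rows = [
    ["Comment_1", "Theme_1", 1],
    ["Comment_2", "Theme_2", 2],
    ["Theme_1", "Reco_par_1", 2],
    ["Theme_2", "Reco_par_2", 2],
]

through = defaultdict(int)
for source, target, value in rows:
    through[source] += value
    through[target] += value

print(dict(through))
```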
@@ -151,4 +151,19 @@ session.commit()
 ###f.close()
 ##
 ##
+from ngram.stop import importStopList
+
+root = session.query(Node).filter(Node.type_id==cache.NodeType['Root'].id).first()
+importStopList(root, '/srv/gargantext/init/stop_lists/fr.txt', 'fr')
+importStopList(root, '/srv/gargantext/init/stop_lists/en.txt', 'en')
+
+root = session.query(Node).filter(Node.type_id==cache.NodeType['Root'].id).first()
+#importStopList(root, '/srv/gargantext/init/stop_lists/fr.txt', 'fr')
+importStopList(root, '/srv/gargantext/init/stop_lists/en.txt', 'en')
 #exit()
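
The init script now seeds root-level stop lists from the bundled files; the second block repeats the root lookup with the French import commented out. A hedged alternative, not part of the commit, that imports each available list exactly once:

```python
# Sketch only, not part of the commit: seed one StopList per language file,
# using the same paths and helpers as the init script above.
stop_lists = {
    'fr': '/srv/gargantext/init/stop_lists/fr.txt',
    'en': '/srv/gargantext/init/stop_lists/en.txt',
}
root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
for language, path in stop_lists.items():
    importStopList(root, path, language)
```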
@@ -60,15 +60,17 @@ def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
     #print([n for n in tfidf_ngrams])

-    def list2set(_list,_set):
+    def list2set(_list):
+        _set = set()
         for n in _list:
             _set.add((n[0],n[1]))
+        return(_set)

     cvalue_set = set()
     spec_set = set()

-    list2set(cvalue_ngrams,cvalue_set)
-    list2set(spec_ngrams,spec_set)
+    cvalue_set = list2set(cvalue_ngrams)
+    spec_set = list2set(spec_ngrams)

     cvalue_setDiff = cvalue_set.difference(spec_set)
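
list2set no longer mutates a caller-supplied set; it builds and returns one. A minimal standalone check of the refactored helper (the sample tuples are made up):

```python
# Standalone check of the refactored helper: it now returns the set it builds.
def list2set(_list):
    _set = set()
    for n in _list:
        _set.add((n[0], n[1]))
    return _set

cvalue_ngrams = [(1, 'ngram a', 0.5), (2, 'ngram b', 0.3)]   # made-up sample data
spec_ngrams = [(2, 'ngram b', 0.9)]

cvalue_set = list2set(cvalue_ngrams)
spec_set = list2set(spec_ngrams)
print(cvalue_set.difference(spec_set))   # {(1, 'ngram a')}
```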
@@ -12,6 +12,8 @@ from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 from sqlalchemy import literal_column
 from sqlalchemy.orm import aliased
+from ngram.tools import insert_ngrams
+import csv

 def compute_miam(corpus,limit=500):
     '''
@@ -50,5 +52,43 @@ def compute_miam(corpus,limit=500):
     dbg.show('Miam computed')

+def insert_miam(corpus, ngrams=None, path_file_csv=None):
+    dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
+    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
+    session.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
+    session.commit()
+
+    stop_words = set()
+    miam_words = set()
+
+    if path_file_csv is not None:
+        file_csv = open(path_file_csv, "r")
+        reader = csv.reader(file_csv, delimiter=',')
+        for line in reader:
+            word = line[0]
+            tag = line[4]
+            if tag == '1':
+                miam_words.add((word, 1))
+            elif tag == '0':
+                stop_words.add((word, 1))
+
+    miam_ids = insert_ngrams(miam_words)
+    print(miam_ids)
+
+    limit = len(list(miam_words))
+    data = zip(
+        [node_miam.id for i in range(1,limit)]
+        , [miam_ids[n] for n in miam_ids.keys()]
+        , [1 for i in range(1,limit)]
+        )
+    #print([d for d in data])
+    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+    file_csv.close()
+    dbg.show('Miam computed')
+
+#corpus = session.query(Node).filter(Node.id==556113).first()
+#insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")
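
insert_miam() reads a tagged CSV: the term comes from column 0 and the keep/stop tag from column 4 ('1' goes into the MiamList, '0' is collected as a stop word but not written anywhere yet). One review note, hedged: `range(1, limit)` yields `limit - 1` node ids and weights, so the `zip` appears to drop one association when the word list is full. A hedged example of the expected layout (the file name matches the commented call; the empty middle columns are illustrative):

```python
# Hedged example of the CSV layout insert_miam() appears to expect:
# column 0 = term, column 4 = tag ('1' -> MiamList, '0' -> stop word).
import csv

with open("Thesaurus_tag.csv", "w") as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(["climate change", "", "", "", "1"])   # kept in the Miam list
    writer.writerow(["however", "", "", "", "0"])          # tagged as a stop word

# corpus = session.query(Node).filter(Node.id==556113).first()
# insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")
```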
@@ -2,49 +2,122 @@
 #from admin.env import *
 #from ngram.stemLem import *

+import re
+
 from admin.utils import PrintException
 from gargantext_web.db import NodeNgram,NodeNodeNgram
-from gargantext_web.db import get_or_create_node, session
+from gargantext_web.db import cache, session, get_or_create_node
 from sqlalchemy.sql import func
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 from sqlalchemy import literal_column
 from sqlalchemy.orm import aliased
+from ngram.tools import insert_ngrams
+from analysis.lists import WeightedList, UnweightedList
+
+def importStopList(node,filename,language='fr'):
+    with open(filename, "r") as f:
+        stop_list = f.read().splitlines()
+    stop_words = set(stop_list)
+    stop_ids = insert_ngrams([(word, len(word.split(' '))) for word in stop_words])
+
+    stop_node = get_or_create_node(nodetype='StopList', corpus=node)
+    stop_node.language_id = cache.Language[language].id
+    session.add(stop_node)
+    session.commit()
+
+    size = len(list(stop_words))
+    data = zip(
+        [stop_node.id for i in range(0,size)]
+        , [stop_ids[word] for word in list(stop_words)]
+        , [-1 for i in range(0,size)]
+        )
+    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+
+def isStopWord(ngram, stop_words=None):
+    '''
+    ngram :: (Int, String) => (ngram_id, ngram_terms)
+    stop_words :: Set of String
+    (to avoid SQL query each time isStopWord is invoked, get in as parameter)
+    '''
+    word = ngram[1]
+
+    if word in stop_words:
+        return(True)
+
+    def test_match(word, regex):
+        format_regex = re.compile(regex)
+        if format_regex.match(word) :
+            return(True)
+
+    for regex in ["(.*)\d(.*)"
+                 , "^.{1,2}$"
+                 , "(.*)(\.)(.*)"
+                 , "(.*)(\,)(.*)"
+                 , "(.*)(study)(.*)"
+                 , "(.*)(result)(.*)"
+                 , "(.*)(année)(.*)"
+                 , "(.*)(temps)(.*)"
+                 , "(.*)(%)(.*)"
+                 , "(.*)(\{)(.*)"
+                 , "(.*)(terme)(.*)"
+                 , "(.*)(différent)(.*)"
+                 , "(.*)(travers)(.*)"
+                 , "(.*)(:|\|)(.*)"
+                 ] :
+        if test_match(word, regex) is True :
+            return(True)
+
-def computeStop(corpus,size=100):
+def compute_stop(corpus,size=2000,debug=False):
     '''
     do some statitics on all stop lists of database of the same type
     '''
-    node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
-    Stop=aliased(NodeNgram)
-    top_spec = (session.query(NodeNodeNgram.ngram_id, NodeNodeNgram.score)
-        .outerjoin(Stop, Stop.ngram_id == NodeNodeNgram.ngram_id)
-        .filter(NodeNodeNgram.nodex_id==node_spec.id)
-        .filter(Stop.node_id==node_stop.id)
-        .order_by(desc(NodeNodeNgram.score))
-        .limit(size)
-        )
-    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    session.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
-    data = zip(
-        [node_miam.id for i in range(1,size)]
-        , [1 for i in range(1,size)]
-        , [n[0] for n in top_spec]
-        )
-    #print([d for d in data])
-    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
+    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
+
+    # TODO do a function to get all stop words with social scores
+    root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
+    root_stop_id = get_or_create_node(nodetype='StopList', corpus=root).id
+
+    stop_words = (session.query(Ngram.terms)
+        .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+        .filter(NodeNgram.node_id == root_stop_id)
+        .all()
+        )
+
+    top_words = (session.query(Ngram.id, Ngram.terms)
+        .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+        .filter(NodeNgram.node_id == miam_node.id)
+        .order_by(desc(NodeNgram.weight))
+        .limit(size)
+        )
+
+    ngrams_to_stop = filter(lambda x: isStopWord(x,stop_words=stop_words), top_words)
+
+    stop = WeightedList({ n[0] : -1 for n in ngrams_to_stop})
+    stop.save(stop_node.id)
+
+    miam = UnweightedList(miam_node.id)
+
+    new_miam = miam - stop
+    new_miam.save(miam_node.id)
+
+    # data = zip(
+    #     [stop_node.id for i in range(0,size)]
+    #     , [ngram[0] for ngram in ngrams_to_stop]
+    #     , [-1 for i in range(0,size)]
+    #     )
+    # bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

-#corpus=session.query(Node).filter(Node.id==244250).first()
-#computeMiam(corpus)
+#corpus=session.query(Node).filter(Node.id==545461).first()
+#compute_stop(corpus)
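
The new stop tools chain together: importStopList() seeds a StopList node from a plain-text file, isStopWord() flags a candidate either by membership in the stop word set or by the regex heuristics above, and compute_stop() subtracts the flagged terms from the MiamList via the WeightedList/UnweightedList arithmetic. One review note, hedged: `session.query(Ngram.terms)...all()` returns row tuples rather than bare strings, so the `word in stop_words` membership test may need the rows unpacked first. A quick standalone check of the heuristics with a made-up stop word set:

```python
# Made-up data: isStopWord() expects (ngram_id, ngram_terms) and a set of terms.
stop_words = {"the", "de la"}

print(isStopWord((1, "de la"), stop_words=stop_words))           # True: listed stop word
print(isStopWord((2, "result of 2014"), stop_words=stop_words))  # True: matches the digit regex
print(isStopWord((3, "x"), stop_words=stop_words))               # True: only 1-2 characters
print(isStopWord((4, "climate change"), stop_words=stop_words))  # None: falls through, kept
```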
@@ -28,8 +28,6 @@ def insert_ngrams(ngrams,get='terms-id'):
             %s AS ngram
         WHERE
             tmp__ngram.terms = ngram.terms
-            AND
-            tmp__ngram.n = ngram.n
     ''' % (Ngram.__table__.name,))

     cursor.execute('''
@@ -67,7 +65,6 @@ def insert_ngrams(ngrams,get='terms-id'):
     db.commit()
     return(ngram_ids)

-
 def insert_nodengramngram(nodengramngram):
     db, cursor = get_cursor()
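
With the `AND tmp__ngram.n = ngram.n` predicate dropped, existing ngrams are matched on their terms alone, so a term already stored with a different n is reused rather than duplicated, which fits the non-NLP path always writing n = 1. A hedged usage sketch based on how the callers in this commit use the helper (the ids printed are purely illustrative):

```python
# insert_ngrams() takes (terms, n) pairs and, with the default get='terms-id',
# returns a terms -> ngram_id mapping.
ngrams_data = {("climate change", 2), ("climate", 1)}
ngram_ids = insert_ngrams(ngrams_data)
print(ngram_ids)   # e.g. {'climate change': 1234, 'climate': 1235}
```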
@@ -177,6 +177,8 @@ def parse_resources(corpus, user=None, user_id=None):

 # ngrams extraction
 from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
+from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
+
 class NgramsExtractors(defaultdict):
     def __init__(self):
         # English
@@ -201,7 +203,7 @@ class NgramsExtractors(defaultdict):

 ngramsextractors = NgramsExtractors()

-def extract_ngrams(corpus, keys):
+def extract_ngrams(corpus, keys, nlp=True):
     dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
     default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
     # query the hyperdata associated with the given keys
@@ -220,7 +222,7 @@ def extract_ngrams(corpus, keys):
     ngrams_data = set()
     ngrams_language_data = set()
-    ngrams_tag_data = set()
+    #ngrams_tag_data = set()
     node_ngram_list = defaultdict(lambda: defaultdict(int))
     for nodeinfo in hyperdata_query:
@@ -237,17 +239,25 @@ def extract_ngrams(corpus, keys):
         ngramsextractor = ngramsextractors[language_iso2]
         for text in nodeinfo[2:]:
             if text is not None and len(text):
-                ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
+                if nlp == True:
+                    ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
+                else:
+                    ngrams = wordpunct_tokenize(text.lower())
                 for ngram in ngrams:
-                    n = len(ngram)
-                    terms = ' '.join([token for token, tag in ngram]).lower()
+                    if nlp == True:
+                        n = len(ngram)
+                        terms = ' '.join([token for token, tag in ngram]).lower()
+                    else:
+                        terms = ngram
+                        n = 1
                     # TODO BUG here
-                    if n == 1:
+                    #if n == 1:
                         #tag_id = cache.Tag[ngram[0][1]].id
-                        tag_id = 1
+                    #    tag_id = 1
                         #print('tag_id', tag_id)
-                    elif n > 1:
-                        tag_id = 1
+                    #elif n > 1:
+                    #    tag_id = 1
                         #tag_id = cache.Tag[ngram[0][1]].id
                         #tag_id = cache.Tag['NN'].id
                         #tag_id = 14
@@ -255,7 +265,7 @@ def extract_ngrams(corpus, keys):
                     node_ngram_list[node_id][terms] += 1
                     ngrams_data.add((terms[:255],n))
                     ngrams_language_data.add((terms, language_id))
-                    ngrams_tag_data.add((terms, tag_id))
+                    #ngrams_tag_data.add((terms, tag_id))

     # insert ngrams to temporary table
     dbg.show('find ids for the %d ngrams' % len(ngrams_data))
@@ -263,12 +273,12 @@ def extract_ngrams(corpus, keys):
     ngram_ids = insert_ngrams(ngrams_data)

     dbg.show('insert associations')
-    node_ngram_data = list()
+    node_ngram_data = set()
     for node_id, ngrams in node_ngram_list.items():
         for terms, weight in ngrams.items():
             try:
                 ngram_id = ngram_ids[terms]
-                node_ngram_data.append((node_id, ngram_id, weight, ))
+                node_ngram_data.add((node_id, ngram_id, weight, ))
             except Exception as e:
                 print("err01:",e)
     bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'], node_ngram_data, cursor=cursor)
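
When `nlp=False`, the tagger-based extractor is bypassed entirely: the text is lowercased and split with NLTK's wordpunct_tokenize, and every token is stored as a 1-gram. The tag bookkeeping is commented out along the way, and node_ngram_data becomes a set, so identical (node_id, ngram_id, weight) triples are deduplicated before bulk_insert. A minimal standalone sketch of the non-NLP path with a made-up sentence:

```python
# Minimal sketch of the non-NLP tokenization path added above.
from nltk.tokenize import wordpunct_tokenize

text = "Climate change: a [short] example."
for terms in wordpunct_tokenize(text.lower()):
    n = 1                       # monograms only in this mode
    print((terms[:255], n))     # punctuation such as ':' or '[' also comes back as a token
```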