Commit f0cc9050 authored by delanoe

[FEAT] adding option to tokenize monograms without nlp, + stop tools

parent dcfe453b
......@@ -42,7 +42,7 @@ def apply_workflow(corpus_id):
parse_resources(corpus)
update_processing(corpus, 2)
extract_ngrams(corpus, ['title', 'abstract'])
extract_ngrams(corpus, ['title', 'abstract'], nlp=True)
update_processing(corpus, 3)
ngram_workflow(corpus)
......
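The new keyword argument defaults to the previous behaviour, so other callers of extract_ngrams are unaffected; a minimal sketch of the workflow call above under both settings (nlp=False is the path this commit adds):

extract_ngrams(corpus, ['title', 'abstract'], nlp=True)   # default: POS-tagging extractors, as before
extract_ngrams(corpus, ['title', 'abstract'], nlp=False)  # new: plain tokenization, monograms only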
......@@ -699,9 +699,17 @@ def sankey_csv(request, corpus_id):
corpus = session.query(Node).filter(Node.id==corpus_id).first()
data = [
["source", "target", "value"]
, ["Elvis_1", "Elvis_2", 1]
, ["Elvis_2", "Elvis_3", 2]
, ["Barry", "Elvis_3", 2]
, ["Comment_1", "Theme_1", 1]
, ["Comment_2", "Theme_2", 2]
, ["Comment_3", "Theme_2", 2]
, ["Comment_7", "Theme_1", 2]
, ["Comment_8", "Theme_3", 2]
, ["Theme_1", "Reco_par_1", 2]
, ["Theme_2", "Reco_par_2", 2]
, ["Theme_2", "Reco_par_5", 2]
, ["Theme_3", "Reco_par_5", 1]
]
return(CsvHttpResponse(data))
......
......@@ -151,4 +151,19 @@ session.commit()
###f.close()
##
##
from ngram.stop import importStopList
root = session.query(Node).filter(Node.type_id==cache.NodeType['Root'].id).first()
importStopList(root, '/srv/gargantext/init/stop_lists/fr.txt', 'fr')
importStopList(root, '/srv/gargantext/init/stop_lists/en.txt', 'en')
root = session.query(Node).filter(Node.type_id==cache.NodeType['Root'].id).first()
#importStopList(root, '/srv/gargantext/init/stop_lists/fr.txt', 'fr')
importStopList(root, '/srv/gargantext/init/stop_lists/en.txt', 'en')
#exit()
......@@ -60,15 +60,17 @@ def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
#print([n for n in tfidf_ngrams])
def list2set(_list,_set):
def list2set(_list):
_set = set()
for n in _list:
_set.add((n[0],n[1]))
return(_set)
cvalue_set = set()
spec_set = set()
list2set(cvalue_ngrams,cvalue_set)
list2set(spec_ngrams,spec_set)
cvalue_set = list2set(cvalue_ngrams)
spec_set = list2set(spec_ngrams)
cvalue_setDiff = cvalue_set.difference(spec_set)
......
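list2set now builds and returns the set instead of filling an out-parameter; a self-contained sketch of the refactored helper with hypothetical (id, terms) rows:

def list2set(_list):
    # Collect (id, terms) pairs from the query rows into a set.
    _set = set()
    for n in _list:
        _set.add((n[0], n[1]))
    return _set

cvalue_set = list2set([(1, 'cell'), (2, 'stem cell')])   # hypothetical cvalue rows
spec_set = list2set([(2, 'stem cell'), (3, 'tissue')])   # hypothetical specificity rows
cvalue_setDiff = cvalue_set.difference(spec_set)         # {(1, 'cell')}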
......@@ -12,6 +12,8 @@ from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from ngram.tools import insert_ngrams
import csv
def compute_miam(corpus,limit=500):
'''
......@@ -50,5 +52,43 @@ def compute_miam(corpus,limit=500):
dbg.show('Miam computed')
def insert_miam(corpus, ngrams=None, path_file_csv=None):
dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
session.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
session.commit()
stop_words = set()
miam_words = set()
if path_file_csv is not None:
file_csv = open(path_file_csv, "r")
reader = csv.reader(file_csv, delimiter=',')
for line in reader:
word = line[0]
tag = line[4]
if tag == '1':
miam_words.add((word, 1))
elif tag == '0':
stop_words.add((word, 1))
miam_ids = insert_ngrams(miam_words)
print(miam_ids)
limit = len(list(miam_words))
data = zip(
[node_miam.id for i in range(1,limit)]
, [miam_ids[n] for n in miam_ids.keys()]
, [1 for i in range(1,limit)]
)
#print([d for d in data])
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
file_csv.close()
dbg.show('Miam computed')
#corpus = session.query(Node).filter(Node.id==556113).first()
#insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")
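insert_miam reads the term from column 0 and a keep/stop tag from column 4 of the CSV ('1' feeds the Miam list, '0' the stop set); a hedged sketch of a file in that layout (the file name and the middle columns are hypothetical and are ignored by the loop above):

import csv

rows = [
    ["stem cell", "", "", "", "1"],   # tag '1' -> miam_words
    ["study",     "", "", "", "0"],   # tag '0' -> stop_words
]
with open("Thesaurus_tag.csv", "w") as file_csv:
    csv.writer(file_csv).writerows(rows)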
......@@ -2,49 +2,122 @@
#from admin.env import *
#from ngram.stemLem import *
import re
from admin.utils import PrintException
from gargantext_web.db import NodeNgram,NodeNodeNgram
from gargantext_web.db import get_or_create_node, session
from gargantext_web.db import cache, session, get_or_create_node
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from ngram.tools import insert_ngrams
from analysis.lists import WeightedList, UnweightedList
def importStopList(node,filename,language='fr'):
with open(filename, "r") as f:
stop_list = f.read().splitlines()
stop_words = set(stop_list)
stop_ids = insert_ngrams([(word, len(word.split(' '))) for word in stop_words])
stop_node = get_or_create_node(nodetype='StopList', corpus=node)
stop_node.language_id = cache.Language[language].id
session.add(stop_node)
session.commit()
size = len(list(stop_words))
def computeStop(corpus,size=100):
data = zip(
[stop_node.id for i in range(0,size)]
, [stop_ids[word] for word in list(stop_words)]
, [-1 for i in range(0,size)]
)
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
def isStopWord(ngram, stop_words=None):
'''
ngram :: (Int, String) => (ngram_id, ngram_terms)
stop_words :: Set of String
(to avoid SQL query each time isStopWord is invoked, get in as parameter)
'''
word = ngram[1]
if word in stop_words:
return(True)
def test_match(word, regex):
format_regex = re.compile(regex)
if format_regex.match(word) :
return(True)
for regex in ["(.*)\d(.*)"
, "^.{1,2}$"
, "(.*)(\.)(.*)"
, "(.*)(\,)(.*)"
, "(.*)(study)(.*)"
, "(.*)(result)(.*)"
, "(.*)(année)(.*)"
, "(.*)(temps)(.*)"
, "(.*)(%)(.*)"
, "(.*)(\{)(.*)"
, "(.*)(terme)(.*)"
, "(.*)(différent)(.*)"
, "(.*)(travers)(.*)"
, "(.*)(:|\|)(.*)"
] :
if test_match(word, regex) is True :
return(True)
def compute_stop(corpus,size=2000,debug=False):
'''
do some statistics on all stop lists of the database of the same type
'''
node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
Stop=aliased(NodeNgram)
top_spec = (session.query(NodeNodeNgram.ngram_id, NodeNodeNgram.score)
.outerjoin(Stop, Stop.ngram_id == NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.nodex_id==node_spec.id)
.filter(Stop.node_id==node_stop.id)
.order_by(desc(NodeNodeNgram.score))
stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
# TODO do a function to get all stop words with social scores
root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
root_stop_id = get_or_create_node(nodetype='StopList', corpus=root).id
stop_words = (session.query(Ngram.terms)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.filter(NodeNgram.node_id == root_stop_id)
.all()
)
top_words = (session.query(Ngram.id, Ngram.terms)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.filter(NodeNgram.node_id == miam_node.id)
.order_by(desc(NodeNgram.weight))
.limit(size)
)
ngrams_to_stop = filter(lambda x: isStopWord(x,stop_words=stop_words), top_words)
stop = WeightedList({ n[0] : -1 for n in ngrams_to_stop})
stop.save(stop_node.id)
node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
session.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
miam = UnweightedList(miam_node.id)
data = zip(
[node_miam.id for i in range(1,size)]
, [1 for i in range(1,size)]
, [n[0] for n in top_spec]
)
#print([d for d in data])
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
new_miam = miam - stop
new_miam.save(miam_node.id)
# data = zip(
# [stop_node.id for i in range(0,size)]
# , [ngram[0] for ngram in ngrams_to_stop]
# , [-1 for i in range(0,size)]
# )
# bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
#corpus=session.query(Node).filter(Node.id==244250).first()
#computeMiam(corpus)
#corpus=session.query(Node).filter(Node.id==545461).first()
#compute_stop(corpus)
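compute_stop filters the top Miam terms through isStopWord and then subtracts the resulting stop list from the Miam list; a minimal sketch of the filtering step with an in-memory stop set (terms and ids are hypothetical, no database involved):

stop_words = {'study', 'result'}                                 # hypothetical root stop list
top_words = [(1, 'stem cell'), (2, 'study'), (3, '12 patients')]

ngrams_to_stop = filter(lambda x: isStopWord(x, stop_words=stop_words), top_words)
print([n for n in ngrams_to_stop])
# [(2, 'study'), (3, '12 patients')]: 'study' is in the stop set and '12 patients'
# matches the digit regex; only 'stem cell' survives in the Miam list after miam - stop.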
......@@ -28,8 +28,6 @@ def insert_ngrams(ngrams,get='terms-id'):
%s AS ngram
WHERE
tmp__ngram.terms = ngram.terms
AND
tmp__ngram.n = ngram.n
''' % (Ngram.__table__.name,))
cursor.execute('''
......@@ -67,7 +65,6 @@ def insert_ngrams(ngrams,get='terms-id'):
db.commit()
return(ngram_ids)
def insert_nodengramngram(nodengramngram):
db, cursor = get_cursor()
......
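With the n column removed from the join, insert_ngrams now matches existing rows on terms alone; its call shape, as used by importStopList and insert_miam in this commit, is unchanged. A hedged sketch of that shape (ids are illustrative, a database connection is assumed):

words = {'stem cell', 'study'}
ngram_ids = insert_ngrams([(word, len(word.split(' '))) for word in words])
# e.g. {'stem cell': 42, 'study': 43} -- maps each term to its ngram id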
......@@ -177,6 +177,8 @@ def parse_resources(corpus, user=None, user_id=None):
# ngrams extraction
from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
class NgramsExtractors(defaultdict):
def __init__(self):
# English
......@@ -201,7 +203,7 @@ class NgramsExtractors(defaultdict):
ngramsextractors = NgramsExtractors()
def extract_ngrams(corpus, keys):
def extract_ngrams(corpus, keys, nlp=True):
dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
# query the hyperdata associated with the given keys
......@@ -220,7 +222,7 @@ def extract_ngrams(corpus, keys):
ngrams_data = set()
ngrams_language_data = set()
ngrams_tag_data = set()
#ngrams_tag_data = set()
node_ngram_list = defaultdict(lambda: defaultdict(int))
for nodeinfo in hyperdata_query:
......@@ -237,17 +239,25 @@ def extract_ngrams(corpus, keys):
ngramsextractor = ngramsextractors[language_iso2]
for text in nodeinfo[2:]:
if text is not None and len(text):
ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
if nlp == True:
ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
else:
ngrams = wordpunct_tokenize(text.lower())
for ngram in ngrams:
n = len(ngram)
terms = ' '.join([token for token, tag in ngram]).lower()
if nlp == True:
n = len(ngram)
terms = ' '.join([token for token, tag in ngram]).lower()
else:
terms = ngram
n = 1
# TODO BUG here
if n == 1:
#if n == 1:
#tag_id = cache.Tag[ngram[0][1]].id
tag_id = 1
# tag_id = 1
#print('tag_id', tag_id)
elif n > 1:
tag_id = 1
#elif n > 1:
# tag_id = 1
#tag_id = cache.Tag[ngram[0][1]].id
#tag_id = cache.Tag['NN'].id
#tag_id = 14
......@@ -255,7 +265,7 @@ def extract_ngrams(corpus, keys):
node_ngram_list[node_id][terms] += 1
ngrams_data.add((terms[:255],n))
ngrams_language_data.add((terms, language_id))
ngrams_tag_data.add((terms, tag_id))
#ngrams_tag_data.add((terms, tag_id))
# insert ngrams to temporary table
dbg.show('find ids for the %d ngrams' % len(ngrams_data))
......@@ -263,12 +273,12 @@ def extract_ngrams(corpus, keys):
ngram_ids = insert_ngrams(ngrams_data)
dbg.show('insert associations')
node_ngram_data = list()
node_ngram_data = set()
for node_id, ngrams in node_ngram_list.items():
for terms, weight in ngrams.items():
try:
ngram_id = ngram_ids[terms]
node_ngram_data.append((node_id, ngram_id, weight, ))
node_ngram_data.add((node_id, ngram_id, weight, ))
except Exception as e:
print("err01:",e)
bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'], node_ngram_data, cursor=cursor)
......
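A minimal sketch of the fallback path the nlp flag enables, using only the nltk tokenizer imported in the hunk above (nlp=True keeps the previous behaviour, where the language-specific NgramsExtractor returns (token, tag) groups joined into multi-word terms):

from nltk.tokenize import wordpunct_tokenize

# nlp=False path: plain tokenization, every token becomes a monogram (n = 1).
text = "Stem cells are [remarkable] cells."
for terms in wordpunct_tokenize(text.lower()):
    n = 1
    print(terms, n)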