Commit f6a14f94 authored by Administrator's avatar Administrator

nouveau fichier : ngram/stemLem.py

Functions to be imported in order to manage lists and create stems.

	nouveau fichier : ngram/lists.py
Functions to be imported to manage lists.
parent cab96c0f
import sys
from admin.utils import PrintException
from gargantext_web.db import NodeNgram
from gargantext_web.db import *
from parsing.corpustools import *
import sqlalchemy
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
# from gargantext_web.db import Node, get_cursor
def nodeList(user_id=None, corpus_id=None, typeList='MiamList'):
    '''
    Get — or lazily create — the list Node of the requested type.

    nodeList :: Integer -> Integer -> String -> [Node]
    user_id   :: Integer
    corpus_id :: Integer
    typeList  :: String, Type of the Node that should be created
    Returns a list of (node.id, node.name) tuples; creates one default
    Node when none exists yet.  Returns None if ids are missing.
    '''
    if corpus_id is None or user_id is None:
        print("Usage (Warning): Need corpus_id and user_id")
        return

    # Root-level lists are shared; user-level lists hang under the corpus.
    shared_types = ['Stem', 'Lem']
    per_user_types = ['MiamList', 'StopList', 'MainList']

    if typeList in per_user_types:
        found = (session.query(Node)
                 .filter(Node.user_id == user_id,
                         Node.parent_id == corpus_id,
                         Node.type_id == cache.NodeType[typeList].id)
                 .all())
    elif typeList in shared_types:
        found = (session.query(Node)
                 .filter(Node.type_id == cache.NodeType[typeList].id)
                 .all())
    else:
        print('typeList not supported yet')
        sys.exit(0)

    if not found:
        # Nothing yet: create a default list node for this corpus/user.
        created = Node(user_id=user_id,
                       parent_id=corpus_id,
                       type_id=cache.NodeType[typeList].id,
                       name="First default Node " + str(typeList))
        session.add(created)
        session.commit()
        return [(created.id, created.name)]

    return [(node.id, node.name) for node in found]
def stopList(user_id=None, corpus_id=None,
             stop_id=None,
             reset=False, limit=None
             ):
    '''
    Compute the stopList and returns its Node.id

    NOTE(review): this function is a stub — it resolves (or creates) the
    StopList node via nodeList() but then returns None; the docstring's
    promise of returning a Node.id is not yet implemented.  The `reset`
    and `limit` parameters are currently unused.
    '''
    if stop_id is None:
        # nodeList() returns [(id, name)] — stop_id here is that list,
        # not a bare id.  TODO confirm intended shape before finishing.
        stop_id = nodeList(user_id=user_id,
                           corpus_id=corpus_id,
                           typeList='StopList')
    # according to type of corpus, choose the right default stopList
def doList(
        type_list='miam',
        user_id=None, corpus_id=None,
        miam_id=None, stop_id=None, main_id=None,
        lem_id=None, stem_id=None, cvalue_id=None, group_id=None,
        reset=True, limit=None
        ):
    '''
    Compute the miamList and returns its Node.id
        miamList = allList - stopList
        where:
            allList = all Ngrams
            stopList = all Stop Ngrams
    OR
    Compute the mainList : main Forms
        mainList = miamList - (stem|lem|group|cvalue) List
        where:
            group = Words grouped manually by user
            stem = equivalent Words which are stemmed (but the main form)
            lem = equivalent Words which are lemmatized (but the main form)
            cvalue = equivalent N-Words according to C-Value (but the main form)
    '''
    if type_list not in ['miam', 'main']:
        print('Type List supported: \'miam\' or \'main\'')
        # NOTE(review): exits the whole process (with success code 0)
        # on bad input — a ValueError would be friendlier to callers.
        sys.exit(0)
    try:
        # Map each list role to its node type and (possibly missing) id.
        list_dict = {
            'miam' : { 'type' : 'MiamList', 'id' : miam_id},
            'stop' : { 'type' : 'StopList', 'id' : stop_id},
        }
        if 'main' == type_list:
            list_dict.update(
                {
                    'main' : { 'type' : 'MainList', 'id' : main_id},
                    'stem' : { 'type' : 'Stem', 'id' : stem_id},
                    #'lem'   : { 'type' : 'LemList', 'id' : lem_id},
                    #'group' : { 'type' : 'Group', 'id' : group_id},
                }
            )
        # Resolve any missing node id by getting/creating the list node.
        for list_ in list_dict.keys():
            if list_dict[list_]['id'] is None:
                list_dict[list_]['id'] = nodeList(user_id=user_id,
                                                  corpus_id=corpus_id,
                                                  typeList=list_dict[list_]['type'])[0][0]
        # Delete previous List ?
        # By default, miamList is computed each time
        if reset is True:
            session.query(NodeNgram).filter(
                NodeNgram.node_id == list_dict[type_list]['id']
            ).delete()
    except:
        # NOTE(review): bare except — if nodeList() fails here, list_dict
        # entries stay None and the queries below will still blow up.
        PrintException()
    stopNgram = aliased(NodeNgram)
    if 'miam' == type_list:
        # miam = every document ngram of the corpus that has NO entry in
        # the stop list (anti-join via outerjoin + IS NULL filter).
        query = (session.query(
            literal_column(str(list_dict['miam']['id'])).label("node_id"),
            Ngram.id,
            func.count(),
            )
            .select_from(Ngram)
            .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
            .join(Node, NodeNgram.node_id == Node.id)
            .outerjoin(stopNgram,
                       and_(stopNgram.ngram_id == Ngram.id,
                            stopNgram.node_id == list_dict['stop']['id']))
            .filter(Node.parent_id == corpus_id)
            .filter(Node.type_id == cache.NodeType['Document'].id)
            .filter(stopNgram.id == None )
            .group_by(Ngram.id)
            )
    elif 'main' == type_list:
        # Query to get Ngrams for main list
        query = (session.query(
            literal_column(str(list_dict['main']['id'])).label("node_id"),
            Ngram.id,
            func.count(),
            )
            .select_from(Ngram)
            .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
            .filter(NodeNgram.node_id == list_dict['miam']['id'])
            )
        if stem_id is not None:
            # Query with Stems Result need to be checked before prod
            # Keep only one representative ngram per stem group: snn2
            # matches any *other* ngram sharing the same stem with a
            # smaller ngramx_id; filtering snn2.id IS NULL keeps the min.
            snn1 = aliased(NodeNgramNgram)
            snn2 = aliased(NodeNgramNgram)
            query = (query.outerjoin(snn1,
                                     and_(snn1.ngramx_id == Ngram.id,
                                          snn1.node_id == list_dict['stem']['id']
                                          )
                                     )
                     .outerjoin(snn2,
                                and_(snn1.ngramy_id == snn2.ngramy_id,
                                     snn2.node_id == list_dict['stem']['id'],
                                     snn1.ngramx_id < snn2.ngramx_id
                                     )
                                )
                     .filter(snn2.id == None)
                     )
        # Specific group by:
        if stem_id is not None:
            query = query.group_by(Ngram.id, snn1.ngramx_id)
        else:
            query = query.group_by(Ngram.id)
    # here add filter for size of the ngram
    # Order result by occurrences descending
    query = query.order_by(desc(func.count()))
    # Adding specific filters
    if limit is not None:
        query = query.limit(limit)
    else:
        query = query.all()
    # NOTE(review): asymmetric — with a limit, `query` is still a Query
    # object (iterable, executed lazily by bulk_insert); without one it
    # is a materialized list.  Presumably intentional, but verify.
    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], query)
    return(list_dict[type_list]['id'])
from admin.utils import PrintException
from gargantext_web.db import *
from parsing.corpustools import *
from gargantext_web.db import NodeNgram
from sqlalchemy import and_
from gargantext_web.db import get_cursor, bulk_insert
def get_ngramogram(corpus, limit=None):
    """
    Ngram is a composition of ograms (ogram = 1gram)

    Return (id, terms) pairs for multi-word ngrams of the corpus'
    documents that have no NgramNgram decomposition yet, or None on
    query failure.
    """
    try:
        # Build the query step by step: corpus document ngrams of size
        # > 1 that are not yet decomposed (anti-join on NgramNgram).
        undecomposed = session.query(Ngram.id, Ngram.terms)
        undecomposed = undecomposed.outerjoin(
            NgramNgram, NgramNgram.ngram_id == Ngram.id)
        undecomposed = undecomposed.join(
            NodeNgram, NodeNgram.ngram_id == Ngram.id)
        undecomposed = undecomposed.join(
            Node, NodeNgram.node_id == Node.id)
        undecomposed = undecomposed.filter(
            Node.parent_id == corpus.id,
            Node.type_id == cache.NodeType['Document'].id)
        undecomposed = undecomposed.filter(Ngram.n > 1)
        undecomposed = undecomposed.filter(NgramNgram.id == None)
        undecomposed = undecomposed.group_by(Ngram.id, Ngram.terms)
        #print(str(undecomposed))
        if isinstance(limit, (int,)):
            undecomposed = undecomposed.limit(limit)
        return undecomposed.all()
    except Exception as error:
        PrintException()
def split_ngram(ngram):
    '''
    Split a space-separated ngram into (word, position) pairs.

    split_ngram :: String -> [(String, Int)]
    e.g. "honey bee" -> [("honey", 0), ("bee", 1)]
    Prints a warning and returns None when the argument is not a string.
    '''
    if not isinstance(ngram, str):
        print("Parameter should be a string.")
        return
    # enumerate replaces the original manual counter (whose
    # `count <= len(...)` guard was always true and thus dead).
    return [(word, position)
            for position, word in enumerate(ngram.split(' '))]
def insert_ngramngram(ngramngram):
    '''
    Bulk-insert the 1-gram decomposition of ngrams.

    ngramngram :: [(ngram_id, terms)]
    Each ngram's terms are exploded into positioned 1-grams; 1-grams
    missing from the Ngram table are inserted, then their ids resolved.
    Returns a dict mapping 1-gram terms -> Ngram.id.

    Fix: removed the unreachable `return(result)` that followed the real
    return — `result` was never defined, so it would have raised a
    NameError had it ever executed.
    '''
    # Explode: (ngram_id, word, n=1, position) for every word.
    ngrams = list()
    for n in ngramngram:
        for i in split_ngram(n[1]):
            ngrams.append((n[0], i[0], 1, i[1]))

    db, cursor = get_cursor()
    # Staging table: load everything, then reconcile against Ngram.
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__ngram (
            id INT,
            ngram_id INT,
            terms VARCHAR(255) NOT NULL,
            terms_id INT,
            n INT,
            position INT
        );
    ''')
    bulk_insert('tmp__ngram', ['ngram_id', 'terms', 'n', 'position'],
                ngrams, cursor=cursor)
    # Mark the 1-grams that already exist in Ngram.
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            terms_id = ngram.id
        FROM
            %s AS ngram
        WHERE
            tmp__ngram.terms = ngram.terms
    ''' % (Ngram.__table__.name,))
    # Insert the 1-grams that are still unknown.
    cursor.execute('''
        INSERT INTO
            %s (n, terms)
        SELECT
            n, terms
        FROM
            tmp__ngram
        WHERE
            terms_id IS NULL
    ''' % (Ngram.__table__.name,))
    # Resolve ids for the freshly inserted rows.
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            ngram.terms = tmp__ngram.terms
        AND
            tmp__ngram.id IS NULL
    ''' % (Ngram.__table__.name,))
    # Read back the terms -> id mapping.
    ngram_ids = dict()
    cursor.execute('SELECT id, terms FROM tmp__ngram')
    for row in cursor.fetchall():
        ngram_ids[row[1]] = row[0]
    db.commit()
    return(ngram_ids)
def get_ngrams(corpus, unstemmed=True, unlemmatized=False, n=1, limit=None, count_all=False):
    '''
    Return (Ngram.id, Ngram.terms) pairs (or their count) for the
    corpus' document ngrams of size n.

    Node with NodeType 'Stem' should be created at the root of the project.
    When unstemmed is True, ngrams already linked to a stem (via
    NodeNgramNgram under the Stem node) are excluded.

    Fix: previously `node_` was only bound when unstemmed was True but
    was referenced unconditionally in the query, so any call with
    unstemmed=False raised a NameError that the except clause silently
    printed; the stem-exclusion join is now applied conditionally.

    NOTE(review): `unlemmatized` is accepted but unused — presumably a
    planned symmetric filter; confirm before removing.
    '''
    try:
        query = (session
            .query(Ngram.id, Ngram.terms)
            .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
            .join(Node, NodeNgram.node_id == Node.id)
            .filter(Node.parent_id == corpus.id,
                    Node.type_id == cache.NodeType['Document'].id)
            .filter(Ngram.n == n)
        )
        if unstemmed is True:
            # Anti-join: keep only ngrams with no stem link yet.
            node_ = session.query(Node).filter(
                Node.type_id == cache.NodeType['Stem'].id).first()
            query = (query
                .outerjoin(NodeNgramNgram, and_(
                    NodeNgramNgram.ngramx_id == Ngram.id,
                    NodeNgramNgram.node_id == node_.id)
                )
                .filter(NodeNgramNgram.id == None)
            )
        query = query.group_by(Ngram.id, Ngram.terms)
        #print(str(query))
        if isinstance(limit, (int,)):
            query = query.limit(limit)
        if count_all is True:
            return(query.count())
        else:
            return(query.all())
    except Exception as error:
        print("Error Query:", error)
def get_stems(corpus, n=1, limit=None, node_stem=None):
    '''
    get_stems :: Corpus -> [Stem]

    Stem every new n-gram of the corpus and return a set of
    (node_stem.id, ngram_id, stemmed_terms, n) tuples.

    Fix: the default for node_stem used to be a session.query(...) call
    evaluated at import time — it hit the database when the module was
    loaded and was frozen to whatever it returned then.  It is now a
    None sentinel resolved at call time.
    '''
    if node_stem is None:
        node_stem = session.query(Node).filter(
            Node.type_id == cache.NodeType['Stem'].id).first()
    result = set()
    # Pick a stemmer from the corpus language (English is the default).
    if corpus.language_id is None or corpus.language_id == cache.Language['en'].id:
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
        #stemmer.stem('honeybees')
    elif corpus.language_id == cache.Language['fr'].id:
        from nltk.stem.snowball import FrenchStemmer
        stemmer = FrenchStemmer()
        #stemmer.stem('abeilles')
    else:
        # Fallback: the original left `stemmer` unbound for any other
        # language, raising NameError below; default to Porter instead.
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    for ngram_id, word in get_ngrams(corpus, limit=limit, n=n):
        result.add((node_stem.id, ngram_id, stemmer.stem(word), n))
    return(result)
def get_lems(corpus, n=1, limit=None, node_stem=None):
    '''
    get_lems :: Corpus -> [Lem]

    Return a set of (node_stem.id, ngram_id, lemma/stem, n) tuples for
    the corpus' new n-grams.

    Fixes:
    - `from nltk.wordnet import PorterStemmer` was a broken import path
      (no such module); use the real nltk.stem.porter location.
    - the default `node_stem=cache.Node['Lem']` was evaluated at import
      time; it is now a None sentinel resolved at call time.
    - docstring said "get_stems".

    TODO(review): despite the name, this still *stems* (Porter/Snowball)
    rather than lemmatizing — a WordNetLemmatizer would be the real
    implementation; kept as-is to preserve behavior.
    '''
    if node_stem is None:
        node_stem = cache.Node['Lem']
    result = set()
    if corpus.language_id is None or corpus.language_id == cache.Language['en'].id:
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
        #stemmer.stem('honeybees')
    elif corpus.language_id == cache.Language['fr'].id:
        from nltk.stem.snowball import FrenchStemmer
        stemmer = FrenchStemmer()
        #stemmer.stem('abeilles')
    for ngram_id, word in get_ngrams(corpus, limit=limit, n=n):
        result.add((node_stem.id, ngram_id, stemmer.stem(word), n))
    return(result)
def insert_ngrams(stems):
    '''
    Bulk-upsert (terms, n) pairs into the Ngram table.

    stems :: iterable of (terms, n) tuples.
    Returns a dict mapping terms -> Ngram.id for every staged row.
    Strategy: load into a temp table, resolve ids of known terms,
    INSERT the unknown ones, then resolve the remaining ids.
    '''
    db, cursor = get_cursor()
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__ngram (
            id INT,
            terms VARCHAR(255) NOT NULL,
            n INT
        );
    ''')
    bulk_insert('tmp__ngram', ['terms', 'n'], stems, cursor=cursor)
    # Pass 1: pick up ids for terms that already exist in Ngram.
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            tmp__ngram.terms = ngram.terms
    ''' % (Ngram.__table__.name,))
    # Pass 2: insert the terms that are still unknown (id IS NULL).
    cursor.execute('''
        INSERT INTO
            %s (n, terms)
        SELECT
            n, terms
        FROM
            tmp__ngram
        WHERE
            id IS NULL
    ''' % (Ngram.__table__.name,))
    # Pass 3: resolve ids of the rows just inserted.
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            ngram.terms = tmp__ngram.terms
        AND
            tmp__ngram.id IS NULL
    ''' % (Ngram.__table__.name,))
    # Read back the full terms -> id mapping.
    ngram_ids = dict()
    cursor.execute('SELECT id, terms FROM tmp__ngram')
    for row in cursor.fetchall():
        ngram_ids[row[1]] = row[0]
    db.commit()
    return(ngram_ids)
def insert_nodengramstem(node_ngram_stem):
    '''
    Bulk-insert (node_id, ngramx_id, ngramy_id) links into NodeNgramNgram,
    skipping triples that already exist.

    node_ngram_stem :: iterable of (node_id, ngramx_id, ngramy_id).
    Inserted rows get score = 1.  Returns None; commits the transaction.
    '''
    db, cursor = get_cursor()
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__nnn (
            id INT,
            node_id INT,
            ngramx_id INT,
            ngramy_id INT
        );
    ''')
    bulk_insert('tmp__nnn',
                ['node_id', 'ngramx_id', 'ngramy_id'],
                node_ngram_stem, cursor=cursor)
    # nnn = NodeNgramNgram
    # Mark staged triples that already exist (id stays NULL otherwise).
    cursor.execute('''
        UPDATE
            tmp__nnn
        SET
            id = nnn.id
        FROM
            %s AS nnn
        WHERE
            tmp__nnn.node_id = nnn.node_id
        AND
            tmp__nnn.ngramx_id = nnn.ngramx_id
        AND
            tmp__nnn.ngramy_id = nnn.ngramy_id
    ''' % (NodeNgramNgram.__table__.name,))
    # Insert only the triples not seen before.
    cursor.execute('''
        INSERT INTO
            %s (node_id, ngramx_id, ngramy_id, score)
        SELECT
            node_id, ngramx_id, ngramy_id, 1
        FROM
            tmp__nnn
        WHERE
            id is NULL
    ''' % (NodeNgramNgram.__table__.name,))
    db.commit()
def stem_corpus(corpus_id=None):
    '''
    Stem all 2-grams of a corpus and persist ngram->stem links.

    stem_corpus :: Int -> None
    (The original docstring claimed an Int return, but nothing is
    returned; documented honestly instead.)

    Fix: the "Number of new ngrams to stem" print used to call
    get_ngrams(corpus, ...) BEFORE checking corpus for None, producing a
    swallowed error and a misleading "None" count when the corpus id was
    unknown; the print now lives inside the guard.
    '''
    corpus = session.query(Node).filter(Node.id == corpus_id).first()
    if corpus is None:
        print('Usage: stem_corpus(corpus_id=corpus.id)')
        return
    print('Number of new ngrams to stem:',
          get_ngrams(corpus, n=2, count_all=True))
    try:
        result = get_stems(corpus, n=2)
        # Deduplicate on (stemmed_terms, n) before inserting.
        stems = set([(stem[2], stem[3]) for stem in result])
        print('Number of new stems', len(stems))
        stem_ids = insert_ngrams(stems)
        # Link each original ngram to the id of its stem.
        node_ngram_stem = set([(ngram[0],
                                ngram[1],
                                stem_ids[ngram[2]]
                                ) for ngram in list(result)])
        print(list(node_ngram_stem)[:3])
        insert_nodengramstem(node_ngram_stem)
    except:
        PrintException()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment