Commit 08fc1367 authored by Romain Loth

Merge branch 'refactoring-alex' into refactoring-rom

parents 3b2c4c53 119705e5
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.models.nodes import Node   # needed for the Node queries below
from gargantext.models.ngrams import Ngram, NodeNgram,\
                                     NodeNodeNgram, NodeNgramNgram
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from gargantext.util.toolchain.ngram_tools import insert_ngrams
import csv

def compute_mapList(corpus_id, limit=500, n=1, session=None):
    '''
    Compute the MAPLIST of a corpus according to the specificity
    scores and to the main, stop and group lists.
    '''
    monograms_part   = 0.005
    monograms_limit  = round(limit * monograms_part)
    multigrams_limit = limit - monograms_limit

    #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

    list_main_id  = session.query(Node.id).filter(
                        Node.typename  == "MAINLIST",
                        Node.parent_id == corpus_id).first()

    list_stop_id  = session.query(Node.id).filter(
                        Node.typename  == "STOPLIST",
                        Node.parent_id == corpus_id).first()

    list_group_id = session.query(Node.id).filter(
                        Node.typename  == "GROUPLIST",
                        Node.parent_id == corpus_id).first()

    score_spec_id = session.query(Node.id).filter(
                        Node.typename  == "SPECIFICITY",
                        Node.parent_id == corpus_id).first()

    ListMain  = aliased(NodeNgram)
    ListStop  = aliased(NodeNgram)
    ListGroup = aliased(NodeNgramNgram)
    ScoreSpec = aliased(NodeNodeNgram)

    # FIXME outerjoin does not work with the current SQLAlchemy;
    # the lines below do the job, but this can be improved
    query = (session.query(ScoreSpec.ngram_id, ScoreSpec.score)
            .join(ListMain, ScoreSpec.ngram_id == ListMain.ngram_id)
            .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
            #.outerjoin(ListGroup, ListGroup.ngramy_id == ScoreSpec.ngram_id)
            #.outerjoin(ListStop, ListStop.ngram_id == ScoreSpec.ngram_id)
            .filter(ListMain.node_id == list_main_id)
            #.filter(ListGroup.node_id == list_group_id)
            #.filter(ListStop.node_id == list_stop_id)
            .filter(ScoreSpec.nodex_id == score_spec_id)
            )

    top_monograms = (query
            .filter(Ngram.n == 1)
            .order_by(desc(ScoreSpec.score))
            .limit(monograms_limit)
            )

    top_multigrams = (query
            .filter(Ngram.n >= 2)
            .order_by(desc(ScoreSpec.score))
            .limit(multigrams_limit)
            )

    stop_ngrams = (session.query(NodeNgram.ngram_id)
            .filter(NodeNgram.node_id == list_stop_id)
            .all()
            )

    # currently unused: grouping is handled by the commented-out
    # outerjoins above
    grouped_ngrams = (session.query(NodeNgramNgram.ngramy_id)
            .filter(NodeNgramNgram.node_id == list_group_id)
            .all()
            )

    list_map_id = session.query(Node.id).filter(
                        Node.parent_id == corpus_id,
                        Node.typename  == "MAPLIST"
                        ).first()

    if list_map_id is None:
        corpus   = cache.Node[corpus_id]
        user_id  = corpus.user_id
        list_map = Node(name="MAPLIST", parent_id=corpus_id,
                        user_id=user_id, typename="MAPLIST")
        session.add(list_map)
        session.commit()
        list_map_id = list_map.id

    session.query(NodeNgram).filter(NodeNgram.node_id == list_map_id).delete()
    session.commit()

    # the query returns 1-tuples: unpack them into a set of ids
    stop_ids = set(n[0] for n in stop_ngrams)

    data = zip(
          [list_map_id for i in range(limit)]
        , [n[0] for n in list(top_multigrams) + list(top_monograms)
                if n[0] not in stop_ids
          ]
        , [1 for i in range(limit)]
        )
    #print([d for d in data])   # NB uncommenting this would exhaust the zip iterator
    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

    #dbg.show('MapList computed')
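
# Hedged usage sketch, following this module's convention of commented-out
# example calls (the corpus id is hypothetical, and the MAINLIST, STOPLIST,
# GROUPLIST and SPECIFICITY nodes are assumed to already exist under it):
#
#compute_mapList(corpus_id=42, limit=500, session=session)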
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.models.users import User
from gargantext.models.nodes import Node
from gargantext.models.ngrams import Ngram, NodeNgram
import re
import sqlalchemy as sa
from sqlalchemy.sql import func
from sqlalchemy.orm import aliased
from sqlalchemy import desc, asc, or_, and_, Date, cast, select, literal_column
#from ngram.tools import insert_ngrams
def isStopWord(ngram, stop_words=None):
    '''
    ngram      :: (Int, String) => (ngram_id, ngram_terms)
    stop_words :: Set of String
    (passed in as a parameter to avoid one SQL query
     per invocation of isStopWord)
    '''
    word = ngram[1]

    if stop_words is not None and word in stop_words:
        return(True)

    def test_match(word, regex):
        format_regex = re.compile(regex)
        if format_regex.match(word):
            return(True)

    for regex in [
              r"^.{1,2}$"
            , r"(.*)\d(.*)"
            , r"(.*)(\.)(.*)"
            , r"(.*)(\,)(.*)"
            , r"(.*)(< ?/?p ?>)(.*)"   # paragraph markers
            , r"(.*)(study)(.*)"
            , r"(.*)(xx|xi|xv)(.*)"
            , r"(.*)(result)(.*)"
            , r"(.*)(année|nombre|moitié)(.*)"
            , r"(.*)(temps)(.*)"
            , r"(.*)(%)(.*)"
            , r"(.*)(\{)(.*)"
            , r"(.*)(terme)(.*)"
            , r"(.*)(différent)(.*)"
            , r"(.*)(travers)(.*)"
            , r"(.*)(:|\|)(.*)"
            ]:
        if test_match(word, regex) is True:
            return(True)
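
# Hedged examples of the expected behaviour (ngram ids are hypothetical):
#
#isStopWord((1, 'le'), stop_words=set())        # True: matches "^.{1,2}$"
#isStopWord((2, 'covid 19'), stop_words=set())  # True: contains a digit
#isStopWord((3, 'molecular biology'),
#           stop_words={'molecular biology'})   # True: found in the stop set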
def create_gargantua_resources():
    gargantua_id = session.query(User.id).filter(User.username == "gargantua").first()
    project = Node(
        name="Resources",
        user_id=gargantua_id,
        typename="PROJECT")
    session.add(project)
    session.commit()
    # the project must be committed first, so that project.id is set
    # before being used as the stop list's parent_id
    stopList = Node(name="STOPLIST", parent_id=project.id,
                    user_id=gargantua_id, typename="STOPLIST")
    session.add(stopList)
    session.commit()
def compute_stop(corpus_id, stopList_id=None, limit=2000, debug=False):
    '''
    Create the list of stop words.
    TODO: write a function to get all stop words with social scores
    '''
    # Get the StopList if it exists, otherwise create a new one
    # (at this step of development, a new StopList should be created)
    if stopList_id is None:
        stopList_id = session.query(Node.id).filter(
            Node.parent_id == corpus_id,
            Node.typename  == "STOPLIST"
            ).first()
        if stopList_id is None:
            corpus   = cache.Node[corpus_id]
            user_id  = corpus.user_id
            stopList = Node(name="STOPLIST", parent_id=corpus_id,
                            user_id=user_id, typename="STOPLIST")
            session.add(stopList)
            session.commit()
            stopList_id = stopList.id

    # For tests only
    if debug is True:
        session.query(Node).filter(Node.id == stopList_id).delete()
        session.commit()

    # Get common resources: all common StopWords on the platform.
    ## First get the id of the StopList of the Gargantua super user
    gargantua_id = session.query(User.id).filter(User.username == "gargantua").first()
    rootStopList_id = session.query(Node.id).filter(
        Node.user_id  == gargantua_id,
        Node.typename == "STOPLIST"
        ).first()

    ## Then get all the stop words
    ## stop_words :: Set of String
    ## (the query yields 1-tuples, so unpack them into a set of strings,
    ##  which is what isStopWord expects)
    stop_words = set(terms for (terms,) in (session.query(Ngram.terms)
                    .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                    .filter(NodeNgram.node_id == rootStopList_id)
                    .all())
                    )
    print([n for n in stop_words])

    ## Get the ngrams
    ## ngrams :: [(Int, String, Int)]
    frequency = sa.func.count(NodeNgram.weight)
    ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
                .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                .join(Node, Node.id == NodeNgram.node_id)
                .filter(Node.parent_id == corpus_id,
                        Node.typename == "DOCUMENT")
                .group_by(Ngram.id)
                .order_by(desc(frequency))
                #.limit(limit)
                .all()
                )

    # materialize the filter into a list: in Python 3 filter() is lazy
    # and would be exhausted by the debug print below
    ngrams_to_stop = list(filter(
        lambda x: isStopWord(x, stop_words=stop_words), ngrams))
    print([n for n in ngrams_to_stop])

    stop = LISTTYPES["STOPLIST"]({n[0]: -1 for n in ngrams_to_stop})
    stop.save(stopList_id)
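
# Hedged usage sketch (hypothetical corpus id; assumes the platform-wide
# "gargantua" STOPLIST has been created, e.g. via create_gargantua_resources):
#
#compute_stop(corpus_id=42)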
#
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.models.ngrams import Ngram, NodeNgram, NodeNgramNgram
def insert_ngrams(ngrams, get='terms-id'):
    '''
    insert_ngrams :: [(String, Int)] -> dict[terms] = id

    Insert the given (terms, n) pairs into the ngram table when they are
    not already there, and return a dict mapping each terms string to its id.
    '''
    db, cursor = get_cursor()

    # 1) stage the incoming ngrams in a temporary table
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__ngram (
            id INT,
            terms VARCHAR(255) NOT NULL,
            n INT
        );
    ''')
    bulk_insert('tmp__ngram', ['terms', 'n'], ngrams, cursor=cursor)

    # 2) resolve the ids of the ngrams that already exist
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            tmp__ngram.terms = ngram.terms
    ''' % (Ngram.__table__.name,))

    # 3) insert the ngrams that are still unknown (id IS NULL)
    cursor.execute('''
        INSERT INTO
            %s (terms, n)
        SELECT
            terms, n
        FROM
            tmp__ngram
        WHERE
            id IS NULL
    ''' % (Ngram.__table__.name,))

    # 4) resolve the ids of the freshly inserted ngrams
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            ngram.terms = tmp__ngram.terms
        AND
            ngram.n = tmp__ngram.n
        AND
            tmp__ngram.id IS NULL
    ''' % (Ngram.__table__.name,))

    # 5) read the terms -> id mapping back from the temporary table
    ngram_ids = dict()
    cursor.execute('SELECT id, terms FROM tmp__ngram')
    for row in cursor.fetchall():
        ngram_ids[row[1]] = row[0]

    db.commit()
    return(ngram_ids)
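
# Hedged usage sketch. The temporary-table round trip above is a classic
# PostgreSQL bulk-upsert pattern: stage the rows, resolve the ids that
# already exist, insert only the unknown ones, then read the mapping back.
# The terms below are hypothetical:
#
#ids = insert_ngrams([('complex network', 2), ('graph', 1)])
#print(ids)   # e.g. {'complex network': 1234, 'graph': 1235}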
from gargantext_web.db import get_session, cache, get_cursor
from gargantext_web.db import Node, NodeNgram, NodeNodeNgram
from gargantext_web.db import get_or_create_node
#from admin.utils import DebugTime
def compute_occs(corpus, debug=True, session=None):
    '''
    compute_occs :: Corpus -> IO ()
    '''
    #dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
    #dbg.show('Calculate occurrences')
    if session is None:
        session = get_session()

    occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus,
                                   mysession=session)
    #print(occs_node.id)

    (session.query(NodeNodeNgram)
            .filter(NodeNodeNgram.nodex_id == occs_node.id).delete()
    )
    session.commit()

    db, cursor = get_cursor()
    cursor.execute('''
        INSERT INTO
            %s (nodex_id, nodey_id, ngram_id, score)
        SELECT
            %d AS nodex_id,
            %d AS nodey_id,
            nodengram.ngram_id AS ngram_id,
            SUM(nodengram.weight) AS score
        FROM
            %s AS nodengram
        INNER JOIN
            %s AS node ON nodengram.node_id = node.id
        WHERE
            node.parent_id = %d
        AND
            node.type_id = %d
        GROUP BY
            nodengram.ngram_id
    ''' % (  NodeNodeNgram.__table__.name
           , occs_node.id, corpus.id
           , NodeNgram.__table__.name
           , Node.__table__.name
           , corpus.id
           , cache.NodeType['Document'].id
           )
    )
    db.commit()

    if debug is True:
        data = session.query(NodeNodeNgram).filter(
                    NodeNodeNgram.nodex_id == occs_node.id).all()
        print([n for n in data])
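
# Hedged usage sketch (hypothetical corpus id):
#
#session = get_session()
#corpus  = session.query(Node).filter(Node.id == 42).first()
#compute_occs(corpus, session=session)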
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.util.analysis.cooccurrences import do_cooc
from gargantext.models.nodes import Node   # needed for the Node queries below
from gargantext.models.ngrams import Ngram, NodeNgram,\
                                     NodeNgramNgram, NodeNodeNgram

import numpy as np
import pandas as pd
from collections import defaultdict
from sqlalchemy import desc, asc, or_, and_, Date, cast, select

def specificity(cooc_id=None, corpus=None, limit=100, session=None):
    '''
    Compute the specificity (simple calculus): for each ngram, the
    difference between its row sum and its column sum in the normalized
    cooccurrence matrix, rescaled by the matrix size.
    '''
    cooccurrences = (session.query(NodeNgramNgram)
                    .filter(NodeNgramNgram.node_id == cooc_id)
                    .order_by(NodeNgramNgram.score)
                    .limit(limit)
                    )

    matrix = defaultdict(lambda: defaultdict(float))
    for cooccurrence in cooccurrences:
        matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
        matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score

    x = pd.DataFrame(matrix).fillna(0)
    x = x / x.sum(axis=1)

    xs = x.sum(axis=1)
    ys = x.sum(axis=0)
    m  = (xs - ys) / (2 * (x.shape[0] - 1))
    m  = m.sort_values()   # Series.sort(inplace=False) is deprecated

    #node = get_or_create_node(nodetype='Specificity', corpus=corpus)
    node = session.query(Node).filter(
                Node.parent_id == corpus.id,
                Node.typename  == "SPECIFICITY"
                ).first()
    if node is None:
        user_id = corpus.user_id
        node = Node(name="SPECIFICITY", parent_id=corpus.id,
                    user_id=user_id, typename="SPECIFICITY")
        session.add(node)
        session.commit()

    data = zip( [node.id   for i in range(m.shape[0])]
              , [corpus.id for i in range(m.shape[0])]
              , m.index.tolist()
              , m.values.tolist()
              )

    session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id == node.id).delete()
    session.commit()
    bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'],
                [d for d in data])
    return(node.id)
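
# Hedged numeric illustration of the calculus above (hypothetical scores):
#
#matrix = {1: {2: 2.0}, 2: {1: 2.0, 3: 1.0}, 3: {2: 1.0}}
#x = pd.DataFrame(matrix).fillna(0)
#x = x / x.sum(axis=1)
#m = (x.sum(axis=1) - x.sum(axis=0)) / (2 * (x.shape[0] - 1))
#
# m then holds one signed specificity value per ngram id; only the ordering
# of these values matters when the MAPLIST is later selected by score.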
def compute_specificity(corpus, limit=100, session=None):
    '''
    Compute specificities as NodeNodeNgram.
    The workflow is the following:
    1) compute the cooc matrix
    2) compute the specificity score, save it in the database
       and return its Node
    '''
    #dbg = DebugTime('Corpus #%d - specificity' % corpus.id)
    # NB the Cvalue list is currently disabled, so do_cooc is called
    #    without a cvalue_id here
    #list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus)
    cooc_id = do_cooc(corpus=corpus, limit=limit)
    specificity(cooc_id=cooc_id, corpus=corpus, limit=limit, session=session)
    #dbg.show('specificity')


#corpus = session.query(Node).filter(Node.id == 244250).first()
#compute_specificity(corpus)
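
# Hedged sketch of the overall list-building pipeline across these modules,
# extending the commented example above (the ordering is inferred from the
# functions in this commit):
#
#compute_stop(corpus.id)                                  # STOPLIST
#compute_specificity(corpus, session=session)             # cooc + SPECIFICITY
#compute_mapList(corpus.id, limit=500, session=session)   # MAPLIST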