Commit 89c8268c authored by Romain Loth

prototype ngram toolchain in __init__.py (no mainlist nor maplist yet :/)

parent 61237884
......@@ -92,6 +92,8 @@ RESOURCETYPES = [
# },
]
# linguistic extraction parameters
DEFAULT_COOC_THRESHOLD = 4
# other parameters
# default number of docs POSTed to scrappers.views.py
......
......@@ -178,6 +178,8 @@ class WeightedContextIndex(_BaseClass):
def __init__(self, source=None):
self.items = defaultdict(float)
# £TODO
......@@ -222,7 +224,7 @@ class WeightedMatrix(_BaseClass):
# insert new data
bulk_insert(
NodeNgramNgram,
('node_id', 'ngram1_id', 'ngram2_id', 'score'),
('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
((node_id, key1, key2, value) for key1, key2, value in self)
)
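# Illustration (hypothetical values, not from the source): with node_id = 999 and a
# matrix entry (487, 294) -> 3.0, the generator above yields one row per entry:
#     (999, 487, 294, 3.0)    # i.e. (node_id, ngram1_id, ngram2_id, weight)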
......
from .parsing import parse
from .ngrams_extraction import extract_ngrams
from .ngram_scores import compute_occurrences_local, compute_tfidf_local
from .list_stop import compute_stop
from .ngram_scores import compute_occurrences_local, compute_tfidf
from .ngram_coocs_tempo import compute_coocs
from .score_specificity import compute_specificity
from .list_map import compute_mapList # TEST
from .ngram_groups import compute_groups
from gargantext.util.db import session
from gargantext.models import Node
from datetime import datetime
def parse_extract(corpus):
# retrieve corpus from database from id
if isinstance(corpus, int):
......@@ -21,16 +28,47 @@ def parse_extract(corpus):
extract_ngrams(corpus)
print('CORPUS #%d: extracted ngrams' % (corpus.id))
# -------------------------------
# temporary ngram lists workflow
# -------------------------------
print('CORPUS #%d: [%s] starting ngram lists computation' % (corpus.id, t()))
# -> stoplist: compute + write (=> Node and NodeNgram)
stop_id = compute_stop(corpus)
print('CORPUS #%d: [%s] new stoplist node #%i' % (corpus.id, t(), stop_id))
# -> write local tfidf to Node and NodeNodeNgram
ltfidf_id = compute_tfidf(corpus, scope="local")
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# -> write global tfidf to Node and NodeNodeNgram
gtfidf_id = compute_tfidf(corpus, scope="global")
print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
# write occurrences to Node and NodeNodeNgram
occnd_id = compute_occurrences_local(corpus)
print('CORPUS #%d: new occs node #%i' % (corpus.id, occnd_id))
# ?? mainlist: compute + write (to Node and NodeNgram)
# mainlist_id = compute_mainlist(corpus)
# print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# write local tfidf to Node and NodeNodeNgram
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: new localtfidf node #%i' % (corpus.id, ltfidf_id))
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, stop_id = None)
print('CORPUS #%d: [%s] new cooccs node #%i' % (corpus.id, t(), cooc_id))
# write groups to Node and NodeNgramNgram
# ?? specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(cooc_id=cooc_id, corpus=corpus)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
# map_id = compute_mapList(corpus)
# print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occurrences_local(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# -> write groups to Node and NodeNgramNgram
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: new grouplist node #%i' % (corpus.id, group_id))
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
def t():
return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
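# Usage sketch for the prototype toolchain above. parse_extract accepts a Node or a
# corpus id (see the isinstance(corpus, int) branch); the import path and the corpus
# id 1234 below are assumptions for illustration only:
#
#     from gargantext.util.toolchain import parse_extract   # package path assumed
#     parse_extract(1234)   # parse + extract_ngrams + the ngram list steps above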
......@@ -2,15 +2,13 @@ from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.models.users import User
from gargantext.models.nodes import Node
from gargantext.models.ngrams import Ngram, NodeNgram
from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix
from gargantext.models import User, Node, Ngram, NodeNgram
import re
import sqlalchemy as sa
from sqlalchemy.sql import func
from sqlalchemy.orm import aliased
from sqlalchemy import desc, asc, or_, and_, Date, cast, select, literal_column
from sqlalchemy import desc, asc
#from ngram.tools import insert_ngrams
def isStopWord(ngram, stop_words=None):
......@@ -23,20 +21,16 @@ def isStopWord(ngram, stop_words=None):
if word in stop_words:
return(True)
def test_match(word, regex):
format_regex = re.compile(regex)
if format_regex.match(word) :
return(True)
compiled_regexes = [] # to compile them only once
for regex in [
"^.{1,2}$"
, "(.*)\d(.*)"
, "(.*)(\.)(.*)"
# , "(.*)(\.)(.*)" trop fort (enlève les sigles !)
, "(.*)(\,)(.*)"
, "(.*)(< ?/?p ?>)(.*)" # marques de paragraphes
, "(.*)(study)(.*)"
, "(.*)(xx|xi|xv)(.*)"
, "(.*)\b(xx|xi|xv)\b(.*)"
, "(.*)(result)(.*)"
, "(.*)(année|nombre|moitié)(.*)"
, "(.*)(temps)(.*)"
......@@ -47,9 +41,15 @@ def isStopWord(ngram, stop_words=None):
, "(.*)(travers)(.*)"
, "(.*)(:|\|)(.*)"
] :
if test_match(word, regex) is True :
compiled_regexes.append(re.compile(regex))
for format_regex in compiled_regexes:
if format_regex.match(word):
# print("STOPLIST += '%s' (regex: %s)" % (word, format_regex.pattern))
return(True)
return False
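# Usage sketch (assuming the ngram argument is an (id, terms, frequency) tuple as
# produced by the query in compute_stop, and that its terms string is what gets
# matched against the regexes above; the values are hypothetical):
#
#     isStopWord((42, "year 2015", 7))   # True: "year 2015" matches "(.*)\d(.*)"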
def create_gargantua_resources():
gargantua_id = session.query(User.id).filter(User.username=="gargantua").first()
project = Node(
......@@ -61,32 +61,33 @@ def create_gargantua_resources():
session.add(stopList)
session.commit()
def compute_stop(corpus_id,stopList_id=None,limit=2000,debug=False):
def compute_stop(corpus, stopList_id=None, debug=False):
'''
Create list of stop words.
TODO write a function to get all stop words with social scores
'''
# Get the StopList if it exists or create a new one
# At this stage of development, a new StopList should be created
if stopList_id == None:
stopList_id = session.query(Node.id).filter(
Node.parent_id==corpus_id,
Node.parent_id==corpus.id,
Node.typename == "STOPLIST"
).first()
if stopList_id == None:
corpus = cache.Node[corpus_id]
user_id = corpus.user_id
stopList = Node(name="STOPLIST", parent_id=corpus_id, user_id=user_id, typename="STOPLIST")
stopList = Node(name="STOPLIST",
parent_id=corpus.id,
user_id=corpus.user_id,
typename="STOPLIST")
session.add(stopList)
session.commit()
stopList_id = stopList.id
# For tests only
if debug == True:
session.query(Node).filter(Node.id==stopList_id).delete()
session.commit()
# Get common resources, all common StopWords on the platform
## First get the id of the StopList of Gargantua super user
gargantua_id = session.query(User.id).filter(User.username=="gargantua").first()
......@@ -101,16 +102,16 @@ def compute_stop(corpus_id,stopList_id=None,limit=2000,debug=False):
.filter(NodeNgram.node_id == rootStopList_id)
.all()
)
print([n for n in stop_words])
# print([n for n in stop_words])
## Get the ngrams
## ngrams :: [(Int, String, Int)]
frequency = sa.func.count( NodeNgram.weight )
frequency = func.count( NodeNgram.weight )
ngrams = (session.query( Ngram.id, Ngram.terms, frequency )
.join( NodeNgram, NodeNgram.ngram_id == Ngram.id )
.join( Node, Node.id == NodeNgram.node_id )
.filter( Node.parent_id == corpus_id,
.filter( Node.parent_id == corpus.id,
Node.typename == "DOCUMENT")
.group_by( Ngram.id )
.order_by( desc( frequency ) )
......@@ -119,9 +120,10 @@ def compute_stop(corpus_id,stopList_id=None,limit=2000,debug=False):
)
ngrams_to_stop = filter(lambda x: isStopWord(x,stop_words=stop_words), ngrams)
print([n for n in ngrams_to_stop])
# print([n for n in ngrams_to_stop])
stop = LISTTYPES["STOPLIST"]({ n[0] : -1 for n in ngrams_to_stop})
# stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])
stop.save(stopList_id)
#
return stopList_id
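# Usage sketch (hedged), mirroring the call made in the toolchain __init__.py above:
#
#     stop_id = compute_stop(corpus)   # creates or reuses the corpus STOPLIST node
#     # stopped ngram ids are then saved through LISTTYPES["STOPLIST"] as NodeNgram rows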
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.constants import DEFAULT_COOC_THRESHOLD
def compute_coocs(corpus,
threshold = DEFAULT_COOC_THRESHOLD,
weighted = False,
our_id = None,
stop_id = None,
symmetry_filter = True):
"""
Count how often some extracted terms appear
together in a small context (document)
throughout a larger context (corpus).
node_id | ngram_id | weight        ngram1_id | ngram2_id | ucooc | wcooc |
--------+----------+--------       ----------+-----------+-------+-------+
 MYDOC  |   487    |   1       =>     487    |    294    |   1   |   4   |
 MYDOC  |   294    |   3
Fill that info in DB:
- a *new* COOCCURRENCES node
- and all corresponding NodeNgramNgram rows
worst case complexity ~ O(N²/2) with N = number of ngrams
Parameters:
- threshold: on output ucooc count (previously called hapax)
- weighted: if False normal cooc to be saved as result
if True weighted cooc (experimental)
- stop_id: stoplist for filtering input ngrams
- TODO cvalue_id: allow a metric as input filter
- TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
- TODO start, end : filter on document date
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
- isMonopartite: ?? used a nodes_hyperdata_ngrams table ???
basic idea for one doc
======================
each pair of ngrams sharing same doc (node_id)
SELECT idx1.ngram_id, idx2.ngram_id
FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
---------------------------------
WHERE idx1.node_id = idx2.node_id <== that's cooc
---------------------------------
AND idx1.ngram_id <> idx2.ngram_id
AND idx1.node_id = MY_DOC ;
on entire corpus
=================
coocs for each doc :
- each given pair like (termA, termB) will likely appear several times
=> we do GROUP BY (x1.ngram_id, x2.ngram_id)
- normally we can count unique appearances of the pair (ucooc)
- and we can sum the pair's weights over those docs (wcooc or cofreq)
TODO
====
use WeightedMatrix
"""
# /!\ big combinatorial complexity /!\
# for 8439 rows in the nodes_ngrams index, of which 1442 with occ > 1
# 1,859,408 rows for the simple cooc query
# 71,134 rows when restricting to ngrams with occ > 1 (weight)
# docs of our corpus
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
# 2 x the occurrence index table
x1 = aliased(NodeNgram)
x2 = aliased(NodeNgram)
# cooccurrences columns definition
ucooc = func.count(x1.ngram_id).label("ucooc")
# 1) MAIN DB QUERY
coocs_query = (
session.query(x1.ngram_id, x2.ngram_id, ucooc)
.filter(x1.node_id == x2.node_id) # <- by definition of cooc
.filter(x1.ngram_id != x2.ngram_id) # <- b/c not with itself
.filter(x1.node_id.in_(docids_subquery)) # <- b/c within corpus
.group_by(x1.ngram_id, x2.ngram_id)
)
# 2) INPUT FILTERS (reduce N before O(N²))
# £TODO add possibility to restrict to the mainlist
if stop_id:
stop_subquery = (
session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == stop_id)
.subquery()
)
coocs_query = ( coocs_query
.filter( ~ x1.ngram_id.in_(stop_subquery) )
.filter( ~ x2.ngram_id.in_(stop_subquery) )
)
if symmetry_filter:
# 1. filter taking symmetry into account
# -> halves the work !!
# -> but will prevent direct access to the cooccurrences of x2
# -> they will be scattered: recorded under the x1 that came before x2
# -> retrieving them will be more costly, via OR queries like:
# WHERE ngram1 = my_ngram OR ngram2 = my_ngram
coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)
# ------------
# 2. possible upstream filters to reduce the combinatorics
# - for example 929k rows => 35k rows
# - here on weight, but it degrades the results
# => conceivable with another metric (cvalue or tfidf?)
# coocs_query = coocs_query.filter(x1.weight > 1)
# coocs_query = coocs_query.filter(x2.weight > 1)
# ------------
# 3) OUTPUT FILTERS
# ------------------
# threshold
#
coocs_query = coocs_query.having(ucooc > threshold)
# 4) EXECUTE QUERY
# ----------------
# => storage in our matrix structure
matrix = WeightedMatrix(coocs_query.all())
# 5) SAVE
# --------
if our_id:
# use pre-existing id
the_id = our_id
else:
# create the new cooc node
the_cooc = Node(
typename = "COOCCURRENCES",
name = "Coocs (in:%s)" % corpus.name[0:10],
parent_id = corpus.id,
user_id = corpus.user_id,
# saving the parameters of the analysis in the Node JSON
hyperdata = { 'corpus': corpus.id,
'threshold': threshold }
)
session.add(the_cooc)
session.commit()
the_id = the_cooc.id
# ==> save all NodeNgramNgram with link to new cooc node id
matrix.save(the_id)
return the_id
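# Usage sketch (hedged), mirroring the call made in the toolchain __init__.py above:
#
#     cooc_id = compute_coocs(corpus, threshold=DEFAULT_COOC_THRESHOLD, stop_id=None)
#     # -> id of the new COOCCURRENCES node; the pair counts live in NodeNgramNgram rows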
......@@ -32,7 +32,7 @@ def compute_groups(corpus, stoplist_id = None):
stop_ngrams_ids = {}
# we will need the ngrams of the stoplist to filter
if stoplist_id is not None:
for id in session.query(NodeNgram.id).filter(NodeNgram.node_id == stoplist_id).all():
for id in session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stoplist_id).all():
stop_ngrams_ids[id[0]] = True
......
from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session, bulk_insert
from sqlalchemy import text
# £TODO
# from gargantext.util.lists import WeightedContextIndex
......@@ -57,19 +57,48 @@ def compute_occurrences_local(corpus):
return occnode.id
def compute_tfidf_local(corpus):
def compute_tfidf(corpus, scope="local"):
"""
Calculates tfidf within the current corpus
"""
# ?? FIXME could we keep the docids somehow from previous computations ??
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
Parameter:
- scope: {"local" or "global"}
"""
# local <=> within this corpus
if scope == "local":
# All docs of this corpus
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
# global <=> within all corpora of this source
elif scope == "global":
this_source_type = corpus.resources()[0]['type']
# all corpora with the same source type
# (we need raw SQL query for postgres JSON operators) (TODO test speed)
same_source_corpora_query = (session
.query(Node.id)
.from_statement(text(
"""
SELECT id FROM nodes
WHERE hyperdata->'resources' @> '[{"type": %s}]'
""" % this_source_type
))
)
# All docs **in all corpora of the same source**
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id.in_(same_source_corpora_query))
.filter(Node.typename == "DOCUMENT")
.subquery()
)
# N
total_docs = session.query(docids_subquery).count()
# or perhaps at least do the occurrences right now at the same time
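# Formula sketch (hedged; the actual computation sits in lines this hunk does not show).
# With N = total_docs above, tf = count of the ngram in a document and df = number of
# documents containing it, the usual form would be:
#     tfidf(ngram, doc) = tf(ngram, doc) * log(N / df(ngram))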
......@@ -93,12 +122,14 @@ def compute_tfidf_local(corpus):
# -------------------------------------------------
# create the new TFIDF-CORPUS node
ltfidf = Node()
ltfidf.typename = "TFIDF-CORPUS"
ltfidf.name = "tfidf (in:%s)" % corpus.id
ltfidf.parent_id = corpus.id
ltfidf.user_id = corpus.user_id
session.add(ltfidf)
tfidf_nd = Node(parent_id = corpus.id, user_id = corpus.user_id)
if scope == "local":
tfidf_nd.typename = "TFIDF-CORPUS"
tfidf_nd.name = "tfidf-c (in:%s)" % corpus.id
elif scope == "global":
tfidf_nd.typename = "TFIDF-GLOBAL"
tfidf_nd.name = "tfidf-g (in type:%s)" % this_source_type
session.add(tfidf_nd)
session.commit()
# reflect that in NodeNodeNgrams
......@@ -106,7 +137,7 @@ def compute_tfidf_local(corpus):
bulk_insert(
NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'),
((ltfidf.id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
((tfidf_nd.id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
)
return ltfidf.id
return tfidf_nd.id
from gargantext.util.db import *
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.util.analysis.cooccurrences import do_cooc
from gargantext.models.ngrams import Ngram, NodeNgram,\
NodeNgramNgram, NodeNodeNgram
from gargantext.models import Node, Ngram, NodeNgramNgram, NodeNodeNgram
import numpy as np
import pandas as pd
from collections import defaultdict
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
def specificity(cooc_id=None, corpus=None, limit=100, session=None):
def compute_specificity(corpus, cooc_id, limit=100):
'''
Compute the specificity (a simple calculation).
'''
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
.order_by(NodeNgramNgram.score)
.limit(limit)
# no filtering here: the new choice is to filter on tfidf before creation
# .order_by(NodeNgramNgram.weight)
# .limit(limit)
)
matrix = defaultdict(lambda : defaultdict(float))
# £TODO re-rename weight => score
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
x = pd.DataFrame(matrix).fillna(0)
x = x / x.sum(axis=1)
xs = x.sum(axis=1)
ys = x.sum(axis=0)
m = ( xs - ys) / (2 * (x.shape[0] - 1))
m = m.sort(inplace=False)
#node = get_or_create_node(nodetype='Specificity',corpus=corpus)
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
nb_ngrams = len(matrix)
d = pd.DataFrame(matrix).fillna(0)
# proba (x/y) ( <= each column is divided by its total)
d = d / d.sum(axis=0)
# d:Matrix => v: Vector (len = nb_ngrams)
v = d.sum(axis=1)
## d ##
#######
#              Grenelle  biodiversité  kilomètres  site  élus  île
# Grenelle            0             0           4     0     0    0
# biodiversité        0             0           0     0     4    0
# kilomètres          4             0           0     0     4    0
# site                0             0           0     0     4    6
# élus                0             4           4     4     0    0
# île                 0             0           0     6     0    0
## d.sum(axis=1) ##
###################
# Grenelle        4
# biodiversité    4
# kilomètres      8
# site           10
# élus           12
# île             6
# temporary result
# ----------------
# for now we use the row sums as the specificity ranking
# (**same** order as with the pre-refactoring formula, but a simpler computation)
# TODO check the mathematical AND semantic coherence of this indicator
v.sort_values(inplace=True)
# [ ('biodiversité' , 0.333 ),
# ('Grenelle' , 0.5 ),
# ('île' , 0.599 ),
# ('kilomètres' , 1.333 ),
# ('site' , 1.333 ),
# ('élus' , 1.899 ) ]
# ----------------
# specificity node
node = session.query(Node).filter(
Node.parent_id==corpus_id,
Node.parent_id==corpus.id,
Node.typename == "SPECIFICITY"
).first()
if node == None:
corpus = cache.Node[corpus_id]
user_id = corpus.user_id
node = Node(name="SPECIFICITY", parent_id=corpus_id, user_id=user_id, typename="SPECIFICITY")
node = Node(name="Specif (in:%i)" % corpus.id,
parent_id=corpus.id,
user_id=user_id,
typename="SPECIFICITY")
session.add(node)
session.commit()
data = zip( [node.id for i in range(1,m.shape[0])]
, [corpus.id for i in range(1,m.shape[0])]
, m.index.tolist()
, m.values.tolist()
data = zip( [node.id] * nb_ngrams
, [corpus.id] * nb_ngrams
, v.index.tolist()
, v.values.tolist()
)
session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==node.id).delete()
session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id==node.id).delete()
session.commit()
bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
bulk_insert(NodeNodeNgram, ['node1_id', 'node2_id', 'ngram_id', 'score'], [d for d in data])
return(node.id)
def compute_specificity(corpus,limit=100, session=None):
'''
Computing specificities as NodeNodeNgram.
All workflow is the following:
1) Compute the cooc matrix
2) Compute the specificity score, saving it in database, return its Node
'''
#dbg = DebugTime('Corpus #%d - specificity' % corpus.id)
#list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus)
cooc_id = do_cooc(corpus=corpus, cvalue_id=list_cvalue.id,limit=limit)
specificity(cooc_id=cooc_id,corpus=corpus,limit=limit,session=session)
#dbg.show('specificity')
#corpus=session.query(Node).filter(Node.id==244250).first()
#compute_specificity(corpus)
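# Usage sketch (hedged) for the refactored signature compute_specificity(corpus, cooc_id, limit=100),
# mirroring the call made in the toolchain __init__.py:
#
#     spec_id = compute_specificity(corpus=corpus, cooc_id=cooc_id)
#     # -> id of the SPECIFICITY node; the ranking is saved as NodeNodeNgram rows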