Commit 85aeadfb authored by PkSM3

Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into samuel

parents 273dd6b0 ce193205
# Without this, we couldn't use the Django environment
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")
from admin.utils import PrintException
# database tools
from node import models
from gargantext_web.db import *
from parsing.corpustools import *
#!/bin/bash
git checkout unstable
git checkout testing
git merge unstable
git checkout prod-dev
git merge testing
git checkout prod
git merge prod-dev
git checkout unstable
echo "Push ? (yes)"
read y
if [[ $y == "yes" ]]; then
echo "je push"
git push origin prod prod-dev testing unstable
fi
from env import *
from gargantext_web.db import *
from parsing.corpustools import *
from gargantext_web.views import move_to_trash, empty_trash
def do_empty():
corpus_ids = (session.query(Node.id)
.filter(Node.type_id == cache.NodeType['Corpus'].id)
.all()
)
for (corpus_id,) in corpus_ids:
doc_count = int()
doc_count = (session.query(Node.id)
.filter(Node.parent_id == corpus_id)
.filter(Node.type_id == cache.NodeType['Document'].id)
.count()
)
if doc_count == 0 :
move_to_trash(corpus_id)
empty_trash()
do_empty()
def extract_again():
corpus_ids = (session.query(Node.id)
.join(Node_Resource, Node_Resource.node_id == Node.id)
.join(Resource, Node_Resource.resource_id == Resource.id )
.filter(or_(Resource.name == 'Europress (French)',
Resource.name == 'Europress (English)'))
.filter(Node.type_id == cache.NodeType['Corpus'].id )
#.filter(Node.resource_id == cache.NodeType['Corpus'].id) # looks unintended: compares a resource id with a NodeType id
.all()
)
print(corpus_ids)
extract_again()
#add_resource(corpus,
# # file = './data_samples/pubmed_result.xml',
# file = '/srv/gargantext_lib/data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
# type_id = cache.ResourceType['pubmed'].id,
#)
#parse_resources(corpus)
#extract_ngrams(corpus, ('title', ))
#
#
#
## print(corpus)
## corpus = session.query(Node).filter(Node.id == 72771).first()
## corpus = session.query(Node).filter(Node.id == 73017).first()
# compute_tfidf(corpus)
@@ -28,3 +28,4 @@ def PrintException():
line = linecache.getline(filename, lineno, f.f_globals)
print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj))
from env import *
from admin.utils import PrintException
from gargantext_web.db import NodeNgram
from gargantext_web.db import *
from parsing.corpustools import *
import sqlalchemy
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
# from gargantext_web.db import Node, get_cursor
def cooccurrences(user_id=None, corpus_id=None,
mainlist_id=None, stoplist_id=None,
lem=False, stem=True, cvalue=False,
date_begin=None, date_end=None,
size=10, n_min=2, n_max=3):
'''
Function to create a cooccurrence Node
---------------------------------------------------
cooccurrences :: [Text] -> [Word] -> [[Word]]
user_id :: Integer, User.id who creates the cooccurrence matrix
corpus_id :: Integer, Node.id with NodeType "Corpus"
mainlist_id :: Integer, Node.id with NodeType "MainList" and with parent_id=corpus_id
stoplist_id :: Integer, Node.id with NodeType "StopList" and with parent_id=corpus_id
lem :: False | True, if lemmatization should be taken into account
stem :: False | True, if stemming should be taken into account
cvalue :: False | True, if C-value should be taken into account
group :: False | True, if manual groups should be taken into account
date_begin :: Datetime, format YYYY-MM-DD, start of the date range used to split the corpus
date_end :: Datetime, format YYYY-MM-DD, end of the date range used to split the corpus
size :: Integer, size of the cooccurrence list
n_min :: Integer, minimal ngram size n
n_max :: Integer, maximal ngram size n
'''
# We create a new node of Type cooccurrence
if corpus_id is not None and user_id is not None:
node_cooc = session.query(Node).filter(
Node.parent_id==corpus_id,
Node.type_id == cache.NodeType['Cooccurrence'].id
).first()
if node_cooc is None:
node_cooc = Node(user_id = user_id,
parent_id=corpus_id,
type_id=cache.NodeType['Cooccurrence'].id,
name="Cooccurrences corpus " + str(corpus_id))
session.add(node_cooc)
session.commit()
else:
print("Usage (Warning): Need corpus_id and user_id")
# Getting the main lists here, by default create or take the first one.
# Getting nodes for lems, stems and cvalue, if needed.
if stem is True:
node_stem = session.query(Node).filter(
Node.type_id==cache.NodeType['Stem'].id).first()
miamNgram = aliased(NodeNgram)
stopNgram = aliased(NodeNgram)
groupNgram = aliased(NodeNgramNgram)
stemNgram = aliased(NodeNgramNgram)
lemNgram = aliased(NodeNgramNgram)
cvalueNgram = aliased(NodeNgramNgram)
# Literal query here
query = (session.query(Node.id, Ngram.id.label('x'), Ngram.id.label('y'), func.count().label('score'))
.join(NodeNgram, NodeNgram.node_id == Node.id)
.join(Ngram, Ngram.id == NodeNgram.ngram_id)
#.outerjoin(stopNgram, stopNgram.ngram_id == Ngram.id)
.filter(Node.parent_id == corpus_id)
.filter(Node.type_id == cache.NodeType['Document'].id)
#.filter(Ngram.n > n_max)
.group_by(Node.id, Ngram.id) # group by node and ngram so the count() is well-defined
#.limit(size)
.all()
)
return(query)
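
# Usage sketch (illustration only, not part of the original commit): run the
# query on the first corpus found in the database; session, Node and cache all
# come from the imports above.
if __name__ == '__main__':
    corpus = session.query(Node).filter(
        Node.type_id == cache.NodeType['Corpus'].id).first()
    if corpus is not None:
        rows = cooccurrences(user_id=corpus.user_id, corpus_id=corpus.id)
        print(rows[:10])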
@@ -17,23 +17,23 @@ def create_blacklist(user, corpus):
def create_synonymes(user, corpus):
pass
size = 1000
def create_whitelist(user, corpus_id, size=size, count_min=2):
cursor = connection.cursor()
whitelist_type_id = cache.NodeType['WhiteList'].id
blacklist_type_id = cache.NodeType['BlackList'].id
type_document_id = cache.NodeType['Document'].id
white_list = Node(name='WhiteList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=whitelist_type_id)
black_list = Node(name='BlackList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=blacklist_type_id)
session.add(white_list)
session.add(black_list)
session.commit()
# delete first to avoid duplicates
# try:
@@ -105,21 +105,21 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start
COUNT(*) AS score
FROM
node_node AS n -- the nodes that are direct children of the corpus
INNER JOIN
node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node
INNER JOIN
node_node_ngram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the whitelist and in the node
INNER JOIN
node_ngram AS ngX ON ngX.id = whitelistX.ngram_id -- ngrams which are in both
INNER JOIN
node_node_ngram AS nngY ON nngY.node_id = n.id
INNER JOIN
node_node_ngram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id
INNER JOIN
node_ngram AS ngY ON ngY.id = whitelistY.ngram_id
WHERE
n.parent_id = %s
AND
@@ -128,13 +128,13 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start
whitelistY.node_id = %s
AND
nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams
GROUP BY
ngX.id,
ngX.terms,
ngY.id,
ngY.terms
ORDER BY
score DESC
LIMIT
@@ -153,9 +153,9 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
import networkx as nx
from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition
#print(corpus_id, cooc_id)
try:
@@ -172,7 +172,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
@@ -192,41 +192,41 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0)
#xo = diag_null(x)
#y = diag_null(y)
x = x / x.sum(axis=1)
y = y / y.sum(axis=0)
#print(x)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top included or excluded
n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] - 1))
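# Reading of the two scores (an assumption about the intent, not stated in the
# original code): n averages the row and column sums, i.e. how broadly a term
# co-occurs; m is their difference, i.e. whether a term is more generic (high)
# or more specific (low).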
n = n.sort(inplace=False)
m = m.sort(inplace=False)
print(n)
print(m)
nodes_included = 300 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = 300 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the inclusion score for the node size
n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
@@ -236,26 +236,28 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# Removing unconnected nodes
xxx = xx.values
threshold = min(xxx.max(axis=1))
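# (reading, as an assumption: the threshold is the smallest row maximum, so
#  every node keeps at least its strongest edge after the filtering below)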
matrix_filtered = np.where(xxx >= threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
except:
PrintException()
try:
G = nx.from_numpy_matrix(matrix_filtered, create_using=nx.MultiDiGraph())
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
#print(G)
# Removing too connected nodes (find automatic way to do it)
#edges_to_remove = [ e for e in G.edges_iter() if
degree = G.degree()
nodes_to_remove = [n for n in degree if degree[n] <= 1]
G.remove_nodes_from(nodes_to_remove)
partition = best_partition(G)
except:
PrintException()
if type == "node_link":
@@ -270,7 +272,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
except Exception as error:
print("error01: ",error)
data = json_graph.node_link_data(G)
links = []
@@ -285,7 +287,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# print(data)
data["links"] = []
data["links"] = links
elif type == "adjacency":
for node in G.nodes():
try:
@@ -298,7 +300,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
except Exception as error:
print("error02: ",error)
data = json_graph.node_link_data(G)
# data = json_graph.node_link_data(G, attrs={\
# 'source':'source',\
@@ -30,8 +30,8 @@ Install the requirements
5) Type: deactivate
In PostgreSQL, version 9.4 needed
---------------------------------
1) Ensure postgres is started: sudo /etc/init.d/postgresql start
@@ -48,9 +48,8 @@ In PostreSQL
6) psql gargandb
7) CREATE EXTENSION hstore;
8) Ctrl + D
Populate the database
@@ -58,6 +57,8 @@ Populate the database
python manage.py syncdb
run as postgres or gargantua user:
psql -d gargandb -f /srv/gargantext/init/sql/changeDateformat.sql
Last steps of configuration
---------------------------
@@ -65,65 +66,25 @@ Last steps of configuration
1) If your project is not in /srv/gargantext:
ln -s [the project folder] /srv/gargantext
2) Install the libraries:
cd /srv
wget http://dl.gargantext.org/gargantext_lib.tar.bz2
tar xvjf gargantext_lib.tar.bz2
rm gargantext_lib.tar.bz2
3) Init nodetypes and main variables:
/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
4) Patch CTE:
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/patches/cte_tree.models.diff
5) DO NOT use the default aldjemy package:
cd /tmp
git clone https://github.com/mathieurodic/aldjemy
cd aldjemy
python3 setup.py install
Extras
=======
Last steps of configuration:
----------------------------
1) If your project is not in /srv/gargantext:
ln -s [the project folder] /srv/gargantext
2) build gargantext_lib
wget http://docs.delanoe.org/gargantext_lib.tar.bz2
cd /srv/
sudo tar xvjf gargantext_lib.tar.bz2
sudo chown user:user /srv/gargantext_lib
3) Explorer:
mkdir /srv/gargantext_lib/js
sudo chown -R user:user /srv/gargantext_lib/
cd /srv/gargantext_lib/js
git clone git@github.com:PkSM3/garg.git
4) Adapt all symlinks:
ln -s [your folder for tree tagger] [the project folder]/parsing/Tagger/treetagger
Warning: for ln, paths have to be absolute!
5) patch CTE
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/cte_tree.models.diff
6) init nodetypes and main variables
/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
Start Turbo parser server
-------------------------
@@ -82,9 +82,8 @@ print('Initialize node types...')
node_types = [
'Root', 'Trash',
'Project', 'Corpus', 'Document',
'MiamList', 'StopList', 'MainList',
'Stem', 'Lem', 'Group', 'Tfidf',
'Cooccurrence', 'WhiteList', 'BlackList'
]
@@ -93,6 +92,20 @@ for node_type in node_types:
# Integration: resource types
print('Initialize users...')
me = session.query(User).filter(User.username=='alexandre').first()
gargantua = session.query(User).filter(User.username=='gargantua').first()
node_root = Node(user_id=gargantua.id, type_id=cache.NodeType['Root'].id, name='Root')
session.add(node_root)
session.commit()
# node_root.id is only available after the commit above
node_stem = Node(user_id=gargantua.id, type_id=cache.NodeType['Stem'].id, name='Stem', parent_id=node_root.id)
node_lem = Node(user_id=gargantua.id, type_id=cache.NodeType['Lem'].id, name='Lem', parent_id=node_root.id)
session.add(node_stem)
session.add(node_lem)
session.commit()
print('Initialize resource...')
from parsing.parsers_config import parsers
-- BASIC: computing cooccurrences without taking equivalent stems into account
--
-- SELECT
-- -- %d as node_id,
-- ngX.id,
-- ngY.id,
-- COUNT(*) AS score
--FROM
-- node_node AS n -- the nodes that are direct children of the corpus
--
--INNER JOIN
-- node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node
--INNER JOIN
-- node_node_ngram AS mainlistX ON mainlistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the mainlist and in the node
--INNER JOIN
-- node_ngram AS ngX ON ngX.id = mainlistX.ngram_id -- ngrams which are in both
--
--INNER JOIN
-- node_node_ngram AS nngY ON nngY.node_id = n.id
--INNER JOIN
-- node_node_ngram AS mainlistY ON mainlistY.ngram_id = nngY.ngram_id
--INNER JOIN
-- node_ngram AS ngY ON ngY.id = mainlistY.ngram_id
--
--WHERE
-- n.parent_id = 1298
--AND
-- n.type_id = 5
--AND
-- mainlistX.node_id = 1382
--AND
-- mainlistY.node_id = 1382
--AND
-- nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams
--
--GROUP BY
-- ngX.id,
-- ngX.terms,
-- ngY.id,
-- ngY.terms
--
--ORDER BY score DESC
--LIMIT 3
--;
--
-- computing cooccurrences taking equivalent stems into account
SELECT
-- %d as node_id,
ngX.id,
ngY.id,
COUNT(*) AS score
FROM
node_node AS n -- the nodes that are direct children of the corpus
INNER JOIN
node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node
INNER JOIN
node_node_ngram AS mainlistX ON mainlistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the mainlist and in the node
INNER JOIN
node_ngram AS ngX ON ngX.id = mainlistX.ngram_id -- ngrams which are in both
LEFT JOIN
node_nodengramngram AS nggXX ON nggXX.node_id = 94
AND nggXX.ngramx_id = ngX.id
LEFT JOIN
node_nodengramngram AS nggXY ON nggXY.node_id = 94
AND nggXX.ngramy_id = nggXY.ngramy_id
AND nggXX.ngramx_id < nggXY.ngramx_id
INNER JOIN
node_node_ngram AS nngY ON nngY.node_id = n.id
INNER JOIN
node_node_ngram AS mainlistY ON mainlistY.ngram_id = nngY.ngram_id
INNER JOIN
node_ngram AS ngY ON ngY.id = mainlistY.ngram_id
LEFT JOIN
node_nodengramngram AS nggYX ON nggYX.node_id = 94
AND nggYX.ngramx_id = ngY.id
LEFT JOIN
node_nodengramngram AS nggYY ON nggYY.node_id = 94
AND nggYX.ngramy_id = nggYY.ngramy_id
AND nggYX.ngramx_id < nggYY.ngramx_id
WHERE
n.parent_id = 1298
AND
n.type_id = 5
AND
mainlistX.node_id = 1382
AND
mainlistY.node_id = 1382
AND
nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams
--AND
-- nggYY.id is NULL
--AND
-- nggXY.id is NULL
GROUP BY
ngX.id,
ngX.terms,
ngY.id,
ngY.terms
ORDER BY score DESC
LIMIT 3
;
-- select all distinct ngrams of the miam list
SELECT count(*) FROM
(
SELECT ngram_id FROM node_node_ngram
WHERE node_id = 1380 --> node.id of the miam list
GROUP BY ngram_id
) as global
;
-- select all ngrams of a corpus that have a stem
SELECT count(*) FROM
(
SELECT ngramx_id FROM node_nodengramngram as ng
INNER JOIN node_node_ngram as nn
ON nn.ngram_id = ng.ngramx_id
INNER JOIN node_node as n
ON n.id = nn.node_id
AND n.parent_id = 1298 --> node.id of the corpus
WHERE ng.node_id = 94 --> node.id of the stem list
GROUP BY ng.ngramx_id
) as global
;
--- select only the distinct ngrams that have equivalent stems
-- inclusive LEFT JOIN on the ngrams that have a stem
-- exclusive LEFT JOIN on the ngrams that share a stem
select count(*) from
(
SELECT ngram_id FROM node_node_ngram as nn
INNER JOIN node_node as n
ON nn.node_id = n.id
AND n.parent_id = 1298 --> node.id of the corpus
LEFT JOIN node_nodengramngram AS nx
ON nx.node_id = 94 --> node.id Stem
AND nx.ngramx_id = nn.ngram_id
LEFT JOIN node_nodengramngram AS ny
ON nx.ngramy_id = ny.ngramy_id
AND nx.node_id = 94 --> node.id Stem
AND nx.ngramx_id < ny.ngramx_id --> to remove duplicates
WHERE nn.node_id = 1380 --> node.id of the miam list
-- AND ny.id is NULL
GROUP BY nn.ngram_id, nx.ngramx_id --, ny.ngramx_id
) as global
;
@@ -35,7 +35,6 @@ extract_ngrams(corpus, ('title', ))
# print(corpus)
# corpus = session.query(Node).filter(Node.id == 72771).first()
# corpus = session.query(Node).filter(Node.id == 73017).first()
compute_tfidf(corpus)
import sys
from admin.utils import PrintException
from gargantext_web.db import NodeNgram
from gargantext_web.db import *
from parsing.corpustools import *
import sqlalchemy
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
# from gargantext_web.db import Node, get_cursor
def nodeList(user_id=None, corpus_id=None, typeList='MiamList'):
'''
nodeList : get or create NodeList.
nodeList :: Integer -> Integer -> String -> [Node]
user_id :: Integer
corpus_id :: Integer
typeList :: String, Type of the Node that should be created
[Node] :: List of (Int, String) tuples, returned or created by the function
'''
if corpus_id is not None and user_id is not None:
# Nodes are either in root_list or user_list
root_list = ['Stem', 'Lem']
user_list = ['MiamList', 'StopList', 'MainList']
if typeList in user_list:
nodes = session.query(Node).filter(
Node.user_id == user_id,
Node.parent_id==corpus_id,
Node.type_id == cache.NodeType[typeList].id
).order_by(desc(Node.id)).all()
elif typeList in root_list:
nodes = session.query(Node).filter(
Node.type_id == cache.NodeType[typeList].id
).order_by(desc(Node.id)).all()
else:
print('typeList not supported yet')
sys.exit(0)
if nodes == []:
node = Node(user_id = user_id,
parent_id=corpus_id,
type_id=cache.NodeType[typeList].id,
name="First default Node " + str(typeList))
session.add(node)
session.commit()
return([(node.id, node.name),])
else:
return([(node.id, node.name) for node in nodes])
else:
print("Usage (Warning): Need corpus_id and user_id")
def stopList(user_id=None, corpus_id=None,
stop_id=None,
reset=False, limit=None
):
'''
Compute the stopList and return its Node.id
'''
if stop_id is None:
stop_id = nodeList(user_id=user_id,
corpus_id=corpus_id,
typeList='StopList')[0][0]
# according to type of corpus, choose the right default stopList
def doList(
type_list='miam',
user_id=None, corpus_id=None,
miam_id=None, stop_id=None, main_id=None,
lem_id=None, stem_id=None, cvalue_id=None, group_id=None,
reset=True, limit=None
):
'''
Compute the miamList and returns its Node.id
miamList = allList - stopList
where:
allList = all Ngrams
stopList = all Stop Ngrams
OR
Compute the mainList : main Forms
mainList = miamList - (stem|lem|group|cvalue) List
where:
group = Words grouped manually by user
stem = equivalent Words which are stemmed (but the main form)
lem = equivalent Words which are lemmatized (but the main form)
cvalue = equivalent N-Words according to C-Value (but the main form)
'''
if type_list not in ['miam', 'main']:
print('Type List supported: \'miam\' or \'main\'')
sys.exit(0)
try:
list_dict = {
'miam' : { 'type' : 'MiamList', 'id' : miam_id},
'stop' : { 'type' : 'StopList', 'id' : stop_id},
}
if 'main' == type_list:
list_dict.update(
{
'main' : { 'type' : 'MainList', 'id' : main_id},
'stem' : { 'type' : 'Stem', 'id' : stem_id},
#'lem' : { 'type' : 'LemList', 'id' : lem_id},
#'group' : { 'type' : 'Group', 'id' : group_id},
}
)
for list_ in list_dict.keys():
if list_dict[list_]['id'] is None:
list_dict[list_]['id'] = nodeList(user_id=user_id,
corpus_id=corpus_id,
typeList=list_dict[list_]['type'])[0][0]
# Delete previous List ?
# By default, miamList is computed each time
if reset is True:
session.query(NodeNgram).filter(
NodeNgram.node_id == list_dict[type_list]['id']
).delete()
except:
PrintException()
stopNgram = aliased(NodeNgram)
if 'miam' == type_list:
query = (session.query(
literal_column(str(list_dict['miam']['id'])).label("node_id"),
Ngram.id,
func.count(),
)
.select_from(Ngram)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, NodeNgram.node_id == Node.id)
.outerjoin(stopNgram,
and_(stopNgram.ngram_id == Ngram.id,
stopNgram.node_id == list_dict['stop']['id']))
.filter(Node.parent_id == corpus_id)
.filter(Node.type_id == cache.NodeType['Document'].id)
.filter(stopNgram.id == None )
.group_by(Ngram.id)
)
elif 'main' == type_list:
# Query to get Ngrams for main list
query = (session.query(
literal_column(str(list_dict['main']['id'])).label("node_id"),
Ngram.id,
func.count(),
)
.select_from(Ngram)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.filter(NodeNgram.node_id == list_dict['miam']['id'])
)
if stem_id is not None:
# Query with stems; result needs to be checked before prod
snn1 = aliased(NodeNgramNgram)
snn2 = aliased(NodeNgramNgram)
query = (query.outerjoin(snn1,
and_(snn1.ngramx_id == Ngram.id,
snn1.node_id == list_dict['stem']['id']
)
)
.outerjoin(snn2,
and_(snn1.ngramy_id == snn2.ngramy_id,
snn2.node_id == list_dict['stem']['id'],
snn1.ngramx_id < snn2.ngramx_id
)
)
.filter(snn2.id == None)
)
# Specific group by:
if stem_id is not None:
query = query.group_by(Ngram.id, snn1.ngramx_id)
else:
query = query.group_by(Ngram.id)
# here add filter for size of the ngram
# Order result by occurrences descending
query = query.order_by(desc(func.count()))
# Adding specific filters
if limit is not None:
query = query.limit(limit)
else:
query = query.all()
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], query)
return(list_dict[type_list]['id'])
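
# Usage sketch (illustration only, not part of the original commit): build the
# miam list and then the main list of a corpus, letting doList() create the
# list nodes through nodeList(); the corpus is simply the first one found.
if __name__ == '__main__':
    corpus = session.query(Node).filter(
        Node.type_id == cache.NodeType['Corpus'].id).first()
    if corpus is not None:
        miam_id = doList(type_list='miam', user_id=corpus.user_id,
                         corpus_id=corpus.id, limit=150)
        main_id = doList(type_list='main', user_id=corpus.user_id,
                         corpus_id=corpus.id, miam_id=miam_id, limit=150)
        print('miam:', miam_id, 'main:', main_id)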
from admin.utils import PrintException
from gargantext_web.db import *
from parsing.corpustools import *
from gargantext_web.db import NodeNgram
from sqlalchemy import and_
from gargantext_web.db import get_cursor, bulk_insert
def get_ngramogram(corpus, limit=None):
"""
Ngram is a composition of ograms (ogram = 1gram)
"""
try:
query = (session
.query(Ngram.id, Ngram.terms)
.outerjoin(NgramNgram, NgramNgram.ngram_id == Ngram.id)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, NodeNgram.node_id == Node.id)
.filter(Node.parent_id == corpus.id, Node.type_id == cache.NodeType['Document'].id)
.filter(Ngram.n > 1)
.filter(NgramNgram.id == None)
.group_by(Ngram.id, Ngram.terms)
)
#print(str(query))
if isinstance(limit, (int,)):
query = query.limit(limit)
return(query.all())
except Exception as error:
PrintException()
def split_ngram(ngram):
if isinstance(ngram, str):
count = 0
result = list()
ngram_splitted = ngram.split(' ')
for x in ngram_splitted:
if count <= len(ngram_splitted):
result.append((ngram_splitted[count], count))
count += 1
return(result)
else:
print("Parameter should be a string.")
def insert_ngramngram(ngramngram):
ngrams = list()
for n in ngramngram:
for i in split_ngram(n[1]):
ngrams.append((n[0], i[0], 1, i[1]))
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__ngram (
id INT,
ngram_id INT,
terms VARCHAR(255) NOT NULL,
terms_id INT,
n INT,
position INT
);
''')
bulk_insert('tmp__ngram', ['ngram_id', 'terms', 'n', 'position'], ngrams, cursor=cursor)
cursor.execute('''
UPDATE
tmp__ngram
SET
terms_id = ngram.id
FROM
%s AS ngram
WHERE
tmp__ngram.terms = ngram.terms
''' % (Ngram.__table__.name,))
cursor.execute('''
INSERT INTO
%s (n, terms)
SELECT
n, terms
FROM
tmp__ngram
WHERE
terms_id IS NULL
''' % (Ngram.__table__.name,))
cursor.execute('''
UPDATE
tmp__ngram
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngram.terms
AND
tmp__ngram.id IS NULL
''' % (Ngram.__table__.name,))
ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngram')
for row in cursor.fetchall():
ngram_ids[row[1]] = row[0]
db.commit()
return(ngram_ids)
return(result)
def get_ngrams(corpus, unstemmed=True, unlemmatized=False, n=1, limit=None, count_all=False):
'''
Node with NodeType 'Stem' should be created at the root of the project.
'''
if unstemmed is True:
node_ = session.query(Node).filter(Node.type_id == cache.NodeType['Stem'].id).first()
try:
query = (session
.query(Ngram.id, Ngram.terms)
.outerjoin(NodeNgramNgram, and_(
NodeNgramNgram.ngramx_id == Ngram.id,
NodeNgramNgram.node_id==node_.id)
)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, NodeNgram.node_id == Node.id)
.filter(Node.parent_id == corpus.id, Node.type_id == cache.NodeType['Document'].id)
.filter(NodeNgramNgram.id == None)
.filter(Ngram.n == n)
.group_by(Ngram.id, Ngram.terms)
)
#print(str(query))
if isinstance(limit, (int,)):
query = query.limit(limit)
if count_all is True:
return(query.count())
else:
return(query.all())
except Exception as error:
print("Error Query:", error)
def get_stems(corpus, n=1, limit=None,
node_stem=session.query(Node).filter(
Node.type_id==cache.NodeType['Stem'].id).first()):
'''
get_stems :: Corpus -> [Stem]
'''
result = set()
if corpus.language_id is None or corpus.language_id == cache.Language['en'].id:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
#stemmer.stem('honeybees')
elif corpus.language_id == cache.Language['fr'].id:
from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
#stemmer.stem('abeilles')
for ngram_id, word in get_ngrams(corpus, limit=limit, n=n):
result.add((node_stem.id, ngram_id, stemmer.stem(word), n))
return(result)
def get_lems(corpus, n=1, limit=None, node_stem=cache.Node['Lem']):
'''
get_lems :: Corpus -> [Lem]
'''
result = set()
if corpus.language_id is None or corpus.language_id == cache.Language['en'].id:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
#stemmer.stem('honeybees')
elif corpus.language_id == cache.Language['fr'].id:
from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
#stemmer.stem('abeilles')
for ngram_id, word in get_ngrams(corpus, limit=limit, n=n):
result.add((node_stem.id, ngram_id, stemmer.stem(word), n))
return(result)
def insert_ngrams(stems):
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__ngram (
id INT,
terms VARCHAR(255) NOT NULL,
n INT
);
''')
bulk_insert('tmp__ngram', ['terms', 'n'], stems, cursor=cursor)
cursor.execute('''
UPDATE
tmp__ngram
SET
id = ngram.id
FROM
%s AS ngram
WHERE
tmp__ngram.terms = ngram.terms
''' % (Ngram.__table__.name,))
cursor.execute('''
INSERT INTO
%s (n, terms)
SELECT
n, terms
FROM
tmp__ngram
WHERE
id IS NULL
''' % (Ngram.__table__.name,))
cursor.execute('''
UPDATE
tmp__ngram
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngram.terms
AND
tmp__ngram.id IS NULL
''' % (Ngram.__table__.name,))
ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngram')
for row in cursor.fetchall():
ngram_ids[row[1]] = row[0]
db.commit()
return(ngram_ids)
def insert_nodengramstem(node_ngram_stem):
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__nnn (
id INT,
node_id INT,
ngramx_id INT,
ngramy_id INT
);
''')
bulk_insert('tmp__nnn',
['node_id', 'ngramx_id', 'ngramy_id'],
node_ngram_stem, cursor=cursor)
# nnn = NodeNgramNgram
cursor.execute('''
UPDATE
tmp__nnn
SET
id = nnn.id
FROM
%s AS nnn
WHERE
tmp__nnn.node_id = nnn.node_id
AND
tmp__nnn.ngramx_id = nnn.ngramx_id
AND
tmp__nnn.ngramy_id = nnn.ngramy_id
''' % (NodeNgramNgram.__table__.name,))
cursor.execute('''
INSERT INTO
%s (node_id, ngramx_id, ngramy_id, score)
SELECT
node_id, ngramx_id, ngramy_id, 1
FROM
tmp__nnn
WHERE
id is NULL
''' % (NodeNgramNgram.__table__.name,))
db.commit()
def stem_corpus(corpus_id=None):
'''
Returns Int as id of the Stem Node
stem_corpus :: Int
'''
corpus = session.query(Node).filter(Node.id == corpus_id).first()
print('Number of new ngrams to stem:',
get_ngrams(corpus, n=2, count_all=True))
if corpus is not None:
try:
result = get_stems(corpus, n=2)
stems = set([(stem[2], stem[3]) for stem in result])
print('Number of new stems', len(stems))
stem_ids = insert_ngrams(stems)
node_ngram_stem = set([ (ngram[0],
ngram[1],
stem_ids[ngram[2]]
) for ngram in list(result) ])
print(list(node_ngram_stem)[:3])
insert_nodengramstem(node_ngram_stem)
except:
PrintException()
else:
print('Usage: stem_corpus(corpus_id=corpus.id)')
# Without this, we couldn't use the Django environment
from admin.env import *
from ngram.stemLem import *
from ngram.lists import *
#from cooccurrences import *
#from gargantext_web.views import empty_trash
#empty_trash()
#
#user = session.query(User).all()[0]
user = session.query(User).filter(User.username=='alexandre').first()
print('Current user is:', user.username)
project = session.query(Node).filter(Node.name == 'Test').first()
if project is None:
project = Node(
name = 'Test',
type_id = cache.NodeType['Project'].id,
user_id = user.id
)
session.add(project)
session.commit()
#corpora = session.query(Node).filter(Node.parent_id == project.id,
# Node.type_id == cache.NodeType['Corpus'].id
# ).delete()
#
#models.Node.objects(parent_id = project.id, type_id = cache.NodeType['Corpus']).all().delete()
#
corpus = session.query(Node).filter(Node.parent_id == project.id,
Node.type_id == cache.NodeType['Corpus'].id).first()
if corpus is None:
corpus = Node(
parent_id = project.id,
name = 'Test Corpus',
type_id = cache.NodeType['Corpus'].id,
user_id = user.id
)
session.add(corpus)
session.commit()
add_resource(corpus,
file = '/srv/gargantext_lib/data_samples/pubmed.zip',
# #file = '/srv/gargantext_lib/data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
type_id = cache.ResourceType['Pubmed (xml format)'].id,
)
parse_resources(corpus)
extract_ngrams(corpus, ('title', 'abstract'))
compute_tfidf(corpus)
# Stemming the corpus
print('Working on corpus:', corpus.id, corpus.name)
stem_id = stem_corpus(corpus_id=corpus.id)
print('Stem Node.id is', stem_id)
for typeList in ['MiamList', 'StopList', 'MainList', 'Stem']:
n = nodeList(user_id=user.id,
corpus_id=corpus.id,
typeList=typeList)
print(n)
type_list='miam'
try:
d = doList(type_list=type_list, user_id = user.id, corpus_id = corpus.id, stem_id=stem_id, limit=150)
print('Size of the ' + type_list + ' list:',
session.query(NodeNgram).filter(NodeNgram.node_id == d).count()
)
except:
PrintException()