humanities / gargantext · Commits

Commit 3476d4a2
Authored Jan 21, 2016 by delanoe
Parent: c54058e2

[FIX SESSIONS] Local session for the workflow; the session parameter of get_or_create_node and related helpers is renamed to mysession.

Showing 16 changed files with 138 additions and 212 deletions:
admin/utils.py              +3   -4
analysis/cooccurrences.py   +14  -11
analysis/functions.py       +3   -3
analysis/periods.py         +2   -2
gargantext_web/celery.py    +13  -19
gargantext_web/db.py        +10  -13
ngram/cvalue.py             +3   -5
ngram/group.py              +9   -14
ngram/mapList.py            +15  -26
ngram/occurrences.py        +4   -9
ngram/specificity.py        +9   -20
ngram/stop.py               +6   -11
ngram/tfidf.py              +6   -22
ngram/tools.py              +2   -2
ngram/workflow.py           +15  -15
parsing/corpustools.py      +24  -36
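The same pattern repeats across all sixteen files: every helper that used to open its own scoped session (guarded by a sessionToRemove flag) or to rely on an ambient `session` name now takes an explicit `mysession` parameter and falls back to the module-global session only when the caller passes nothing. A minimal before/after sketch of that pattern (`do_something` is a hypothetical stand-in, not a function from this repository):

    # Before: each helper opened its own scoped session and tried to clean it up.
    def do_something_old(corpus, session=None):
        sessionToRemove = False
        if session is None:
            session = get_session()        # a fresh scoped session per call
            sessionToRemove = True
        ...                                # queries go through `session`
        if sessionToRemove:
            session.remove()               # in several files this sat after a return, i.e. dead code

    # After: the caller owns the session; the helper only borrows it.
    def do_something_new(corpus, mysession=None):
        if mysession is None:
            from gargantext_web.db import session   # module-global fallback
            mysession = session
        ...                                # queries go through `mysession`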
admin/utils.py  (view file @ 3476d4a2)

@@ -47,13 +47,12 @@ def PrintException():
 class WorkflowTracking:
     def __init__(self):
         self.hola = "mundo"

-    def processing_(self, corpus, step):
+    def processing_(self, corpus_id, step):
         try:
-            the_query = """ UPDATE node_node SET hyperdata=\'{\"%s\" : \"%s\"}\' WHERE id=%d """ % ("Processing", step, corpus.id)
+            the_query = """ UPDATE node_node SET hyperdata=\'{\"%s\" : \"%s\"}\' WHERE id=%d """ % ("Processing", step, corpus_id)
             cursor = connection.cursor()
             try:
                 cursor.execute(the_query)
analysis/cooccurrences.py  (view file @ 3476d4a2)

@@ -18,7 +18,8 @@ def do_cooc(corpus=None
             , start=None, end=None, limit=1000
-            , isMonopartite=True, hapax=3):
+            , isMonopartite=True, hapax=3
+            , mysession=None):
     '''
     Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
     For the moment list of paramters are not supported because, lists need to

@@ -40,13 +41,16 @@ def do_cooc(corpus=None
     # Security test
     field1, field2 = str(field1), str(field2)

-    session = get_session()
+    if mysession is None:
+        from gargantext_web.db import session
+        mysession = session

     # Get node
     node_cooc = get_or_create_node(nodetype='Cooccurrence', corpus=corpus
                                   , name_str="Cooccurrences corpus " \
                                     + str(corpus.id) + "list_id: " + str(miam_id)
                                   #, hyperdata={'field1': field1, 'field2':field2}
-                                  , session=session)
+                                  , mysession=mysession)

 # BEGIN

@@ -60,12 +64,12 @@ def do_cooc(corpus=None
 #    node_cooc.hyperdata = hyperdata
 #
-    session.add(node_cooc)
-    session.commit()
+    mysession.add(node_cooc)
+    mysession.commit()
 # END

-    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
-    session.commit()
+    mysession.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
+    mysession.commit()

     doc_id = cache.NodeType['Document'].id

@@ -77,7 +81,7 @@ def do_cooc(corpus=None
     if isMonopartite:
         NodeNgramY = aliased(NodeNgram)
-        cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
+        cooc_query = (mysession.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
                       .join(Node, Node.id == NodeNgramX.node_id)
                       .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                       .filter(Node.parent_id == corpus.id, Node.type_id == doc_id)

@@ -85,7 +89,7 @@ def do_cooc(corpus=None
     else:
         NodeNgramY = aliased(NodeNgram)
-        cooc_query = (session.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
+        cooc_query = (mysession.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
                       .join(Node, Node.id == NodeHyperdataNgram.node_id)
                       .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                       .join(Hyperdata, Hyperdata.id == NodeHyperdataNgram.hyperdata_id)

@@ -169,7 +173,7 @@ def do_cooc(corpus=None
     # Select according some scores
     if cvalue_id is not None:
         #miam = get_or_create_node(nodetype='Cvalue', corpus=corpus)
-        cvalue_list = UnweightedList(session.query(NodeNodeNgram.ngram_id)
+        cvalue_list = UnweightedList(mysession.query(NodeNodeNgram.ngram_id)
                                      .filter(NodeNodeNgram.nodex_id == cvalue_id).all()
                                      )

@@ -200,4 +204,3 @@ def do_cooc(corpus=None
         cooc = matrix
     cooc.save(node_cooc.id)
     return(node_cooc.id)
-    session.remove()
analysis/functions.py  (view file @ 3476d4a2)

@@ -44,9 +44,9 @@ def get_cooc(request=None, corpus=None
     data = {}
     #if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
     print("Cooccurrences do not exist yet, creating it.")
-    miam_id  = get_or_create_node(nodetype='MapList',  corpus=corpus, session=session).id
-    stop_id  = get_or_create_node(nodetype='StopList', corpus=corpus, session=session).id
-    group_id = get_or_create_node(nodetype='Group',    corpus=corpus, session=session).id
+    miam_id  = get_or_create_node(nodetype='MapList',  corpus=corpus, mysession=session).id
+    stop_id  = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=session).id
+    group_id = get_or_create_node(nodetype='Group',    corpus=corpus, mysession=session).id
     SamuelFlag = False
     # if field1 == field2 == 'ngrams' :
analysis/periods.py  (view file @ 3476d4a2)

@@ -51,7 +51,7 @@ def periods(corpus, start=None, end=None):
     if duration.days > 365 * 3:
         print("OK")
-    miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus).id
+    miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=session).id
     result_list = list()
     for t in times:

@@ -86,7 +86,7 @@ def jacquard(period1, period2):
 def get_partition(corpus, start=None, end=None, distance=distance):
     session = get_session()
-    miam_id = get_or_create_node(corpus=corpus, nodetype='MapList', session=session).id
+    miam_id = get_or_create_node(corpus=corpus, nodetype='MapList', mysession=session).id
     print("get Partition %s - %s" % (start, end))
     cooc_id = do_cooc(corpus=corpus, start=start
gargantext_web/celery.py  (view file @ 3476d4a2)

@@ -3,7 +3,7 @@
 from celery import shared_task
 from node import models
 from django.db import transaction

-from admin.utils import DebugTime
+from admin.utils import DebugTime, PrintException

 import cProfile
 #@app.task(bind=True)

@@ -15,15 +15,8 @@ from gargantext_web.db import get_session, cache, Node
 from ngram.workflow import ngram_workflow

-@shared_task
-def apply_sum(x, y):
-    print(x + y)
-    session = get_session()
-    print(session.query(Node.name).first())
-    session.remove()
-
 from parsing.corpustools import parse_resources, extract_ngrams #add_resource,

 from ngram.lists import ngrams2miam
 #from ngram.lists import ngrams2miam
 from admin.utils import WorkflowTracking

@@ -36,28 +29,29 @@ def apply_workflow(corpus_id):
     update_state = WorkflowTracking()
     try:
-        session = get_session()
-        corpus = session.query(Node).filter(Node.id == corpus_id).first()
+        mysession = get_session()
+        corpus = mysession.query(Node).filter(Node.id == corpus_id).first()

-        update_state.processing_(corpus, "Parsing")
+        update_state.processing_(int(corpus_id), "Parsing")
         #cProfile.runctx('parse_resources(corpus)', global,locals)
-        parse_resources(corpus, session=session)
+        parse_resources(corpus, mysession=mysession)

-        update_state.processing_(corpus, "Terms extraction")
-        extract_ngrams(corpus, ['title', 'abstract'], nlp=True, session=session)
+        update_state.processing_(int(corpus_id), "Terms extraction")
+        extract_ngrams(corpus, ['title', 'abstract'], nlp=True, mysession=mysession)

         # update_state.processing_(corpus, "")
-        ngram_workflow(corpus, session=session)
+        ngram_workflow(corpus, mysession=mysession)
         #ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)

         print("End of the Workflow for corpus %d" % (corpus_id))
-        update_state.processing_(corpus, "0")
-        session.remove()
+        update_state.processing_(int(corpus_id), "0")
+        mysession.remove()
     except Exception as error:
         print(error)
-        session.remove()
+        PrintException()
+        mysession.remove()

 @shared_task
 def empty_trash(corpus_id):
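With this change, apply_workflow owns a single scoped session for the whole run: it opens one with get_session(), threads it through parse_resources, extract_ngrams and ngram_workflow as mysession, and removes it on both the success and the failure path (with PrintException now reporting the traceback). A condensed sketch of that lifecycle, using the names from the diff above:

    @shared_task
    def apply_workflow(corpus_id):
        update_state = WorkflowTracking()
        try:
            mysession = get_session()       # one scoped session per task
            corpus = mysession.query(Node).filter(Node.id == corpus_id).first()
            parse_resources(corpus, mysession=mysession)
            extract_ngrams(corpus, ['title', 'abstract'], nlp=True, mysession=mysession)
            ngram_workflow(corpus, mysession=mysession)
            update_state.processing_(int(corpus_id), "0")
            mysession.remove()              # release on success
        except Exception as error:
            print(error)
            PrintException()
            mysession.remove()              # release on failure too

Note one caveat visible in the diff itself: mysession is only bound inside the try, so if get_session() itself raised, the except handler would hit an unbound name.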
gargantext_web/db.py  (view file @ 3476d4a2)

@@ -242,17 +242,16 @@ class bulk_insert:
     readline = read

-def get_or_create_node(nodetype=None, corpus=None, corpus_id=None, name_str=None, hyperdata=None, session=None):
+def get_or_create_node(nodetype=None, corpus=None, corpus_id=None, name_str=None, hyperdata=None, mysession=None):
     '''
     Should be a method of the object. __get_or_create__ ?
     name_str :: String
     hyperdata :: Dict
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True
+    if mysession is None:
+        from gargantext_web.db import session
+        mysession = session

     if nodetype is None:
         print("Need to give a type node")

@@ -262,13 +261,13 @@ def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hy
         except KeyError:
             ntype = cache.NodeType[nodetype] = NodeType()
             ntype.name = nodetype
-            session.add(ntype)
-            session.commit()
+            mysession.add(ntype)
+            mysession.commit()

         if corpus_id is not None and corpus is None:
-            corpus = session.query(Node).filter(Node.id == corpus_id).first()
+            corpus = mysession.query(Node).filter(Node.id == corpus_id).first()

-        node = (session.query(Node).filter(Node.type_id == ntype.id
+        node = (mysession.query(Node).filter(Node.type_id == ntype.id
                                           , Node.parent_id == corpus.id
                                           , Node.user_id == corpus.user_id
                                           )

@@ -289,11 +288,9 @@ def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hy
             node.name = name_str
         else:
             node.name = ntype.name
-        session.add(node)
-        session.commit()
+        mysession.add(node)
+        mysession.commit()
         #print(parent_id, n.parent_id, n.id, n.name)
         return(node)
-    if sessionToRemove:
-        session.remove()
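After this hunk, callers that already hold a scoped session pass it through explicitly, and callers that pass nothing transparently get the module-global session. A hedged usage sketch (get_session and get_or_create_node as imported from gargantext_web.db elsewhere in this commit):

    from gargantext_web.db import get_session, get_or_create_node

    # Caller-owned session, as in celery.apply_workflow above:
    mysession = get_session()
    cooc_node = get_or_create_node(nodetype='Cooccurrence', corpus=corpus,
                                   mysession=mysession)
    mysession.remove()  # the caller, not the helper, decides when to release it

    # No session supplied: falls back to gargantext_web.db.session.
    cooc_node = get_or_create_node(nodetype='Cooccurrence', corpus=corpus)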
ngram/cvalue.py  (view file @ 3476d4a2)

@@ -67,7 +67,7 @@ def getNgrams(corpus=None, limit=1000):
     return(terms)
     session.remove()

-def compute_cvalue(corpus=None, limit=1000):
+def compute_cvalue(corpus=None, limit=1000, mysession=None):
     '''
     computeCvalue :: Corpus
     frequency :: String -> Int -> Int

@@ -126,13 +126,11 @@ def compute_cvalue(corpus=None, limit=1000):
     result = cvalueAll()
     #print([n for n in result])

-    session = get_session()
-    session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id == cvalue_node.id).delete()
-    session.commit()
+    mysession.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id == cvalue_node.id).delete()
+    mysession.commit()

     #bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [n for n in islice(result,0,100)])
     bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [n for n in result])
-    session.remove()

 # test
 #corpus=session.query(Node).filter(Node.id==244250).first()
 #computeCvalue(corpus)
ngram/group.py  (view file @ 3476d4a2)

@@ -47,14 +47,10 @@ def getStemmer(corpus):
     return(stemIt)

-def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=None):
+def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', mysession=None):
     '''
     group ngrams according to a function (stemming or lemming)
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - group' % corpus.id)
     dbg.show('Group')

@@ -66,19 +62,19 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
     stemIt = getStemmer(corpus)

     group_to_insert = set()
-    node_group = get_or_create_node(nodetype='Group', corpus=corpus, session=session)
+    node_group = get_or_create_node(nodetype='Group', corpus=corpus, mysession=mysession)

     miam_to_insert = set()
-    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus, session=session)
+    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=mysession)

-    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus, session=session)
+    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=mysession)
     #stop_list = UnweightedList(stop_node.id)
     Stop = aliased(NodeNgram)

     frequency = sa.func.count(NodeNgram.weight)
-    ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
+    ngrams = (mysession.query(Ngram.id, Ngram.terms, frequency)
              .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
              .join(Node, Node.id == NodeNgram.node_id)
              #.outerjoin(Stop, Stop.ngram_id == Ngram.id)

@@ -90,7 +86,7 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
              .limit(limit_sup)
              )

-    stops = (session.query(Ngram.id, Ngram.terms, frequency)
+    stops = (mysession.query(Ngram.id, Ngram.terms, frequency)
            .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
            .join(Node, Node.id == NodeNgram.node_id)
            .join(Stop, Stop.ngram_id == Ngram.id)

@@ -131,10 +127,10 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
             miam_to_insert.add((miam_node.id, group[key]['mainForm'], 1))

     # # Deleting previous groups
-    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_group.id).delete()
+    mysession.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_group.id).delete()

     # # Deleting previous ngrams miam list
-    session.query(NodeNgram).filter(NodeNgram.node_id == miam_node.id).delete()
-    session.commit()
+    mysession.query(NodeNgram).filter(NodeNgram.node_id == miam_node.id).delete()
+    mysession.commit()

     bulk_insert(NodeNgramNgram, ('node_id', 'ngramx_id', 'ngramy_id', 'score')

@@ -142,4 +138,3 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
     bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), [data for data in list(miam_to_insert)])
-    if sessionToRemove:
-        session.remove()
ngram/mapList.py  (view file @ 3476d4a2)

@@ -15,15 +15,11 @@ from sqlalchemy.orm import aliased
 from ngram.tools import insert_ngrams
 import csv

-def compute_mapList(corpus,limit=500,n=1, session=None):
+def compute_mapList(corpus,limit=500,n=1, mysession=None):
     '''
     According to Specificities and stoplist,
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     monograms_part = 0.005
     monograms_limit = round(limit * monograms_part)

@@ -31,11 +27,11 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

-    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
-    node_group = get_or_create_node(nodetype='Group', corpus=corpus)
+    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=mysession)
+    node_stop = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=mysession)
+    node_group = get_or_create_node(nodetype='Group', corpus=corpus, mysession=mysession)

-    node_spec = get_or_create_node(nodetype='Specificity', corpus=corpus)
+    node_spec = get_or_create_node(nodetype='Specificity', corpus=corpus, mysession=mysession)

     Miam = aliased(NodeNgram)
     Stop = aliased(NodeNgram)

@@ -43,7 +39,7 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
     Spec = aliased(NodeNodeNgram)

-    query = (session.query(Spec.ngram_id, Spec.score)
+    query = (mysession.query(Spec.ngram_id, Spec.score)
             .join(Miam, Spec.ngram_id == Miam.ngram_id)
             .join(Ngram, Ngram.id == Spec.ngram_id)
             #.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)

@@ -66,19 +62,19 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
                  .limit(multigrams_limit)
                  )

-    stop_ngrams = (session.query(NodeNgram.ngram_id)
+    stop_ngrams = (mysession.query(NodeNgram.ngram_id)
                   .filter(NodeNgram.node_id == node_stop.id)
                   .all()
                   )

-    grouped_ngrams = (session.query(NodeNgramNgram.ngramy_id)
+    grouped_ngrams = (mysession.query(NodeNgramNgram.ngramy_id)
                      .filter(NodeNgramNgram.node_id == node_group.id)
                      .all()
                      )

-    node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)
-    session.query(NodeNgram).filter(NodeNgram.node_id == node_mapList.id).delete()
-    session.commit()
+    node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus, mysession=mysession)
+    mysession.query(NodeNgram).filter(NodeNgram.node_id == node_mapList.id).delete()
+    mysession.commit()

     data = zip(
         [node_mapList.id for i in range(1, limit)]

@@ -91,20 +87,14 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
     dbg.show('MapList computed')
-    if sessionToRemove:
-        session.remove()

-def insert_miam(corpus, ngrams=None, path_file_csv=None):
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True
+def insert_miam(corpus, ngrams=None, path_file_csv=None, mysession=None):
     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

-    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    session.query(NodeNgram).filter(NodeNgram.node_id == node_miam.id).delete()
-    session.commit()
+    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=mysession)
+    mysession.query(NodeNgram).filter(NodeNgram.node_id == node_miam.id).delete()
+    mysession.commit()

     stop_words = set()
     miam_words = set()

@@ -133,6 +123,5 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
     file_csv.close()
     dbg.show('Miam computed')
-    if sessionToRemove:
-        session.remove()
ngram/occurrences.py  (view file @ 3476d4a2)

@@ -4,26 +4,22 @@ from gargantext_web.db import Node, NodeNgram, NodeNodeNgram
 from gargantext_web.db import get_or_create_node
 from admin.utils import DebugTime

-def compute_occs(corpus, session=None):
+def compute_occs(corpus, mysession=None):
     '''
     compute_occs :: Corpus -> IO ()
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
     dbg.show('Calculate occurrences')

-    occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus)
+    occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus, mysession=mysession)
     #print(occs_node.id)

-    (session.query(NodeNodeNgram)
+    (mysession.query(NodeNodeNgram)
            .filter(NodeNodeNgram.nodex_id == occs_node.id)
            .delete()
            )
-    session.commit()
+    mysession.commit()

     db, cursor = get_cursor()
     cursor.execute('''

@@ -55,7 +51,6 @@ def compute_occs(corpus, session=None):
             )
           )
     db.commit()
-    if sessionToRemove:
-        session.remove()

     #data = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==occs_node.id).all()
ngram/specificity.py  (view file @ 3476d4a2)

@@ -15,16 +15,12 @@ from gargantext_web.db import NodeNgramNgram, NodeNodeNgram
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select

-def specificity(cooc_id=None, corpus=None, limit=100, session=None):
+def specificity(cooc_id=None, corpus=None, limit=100, mysession=None):
     '''
     Compute the specificity, simple calculus.
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

-    cooccurrences = (session.query(NodeNgramNgram)
+    cooccurrences = (mysession.query(NodeNgramNgram)
                     .filter(NodeNgramNgram.node_id == cooc_id)
                     .order_by(NodeNgramNgram.score)
                     .limit(limit)

@@ -45,23 +41,22 @@ def specificity(cooc_id=None, corpus=None, limit=100, session=None):
     m = (xs - ys) / (2 * (x.shape[0] - 1))
     m = m.sort(inplace=False)

-    node = get_or_create_node(nodetype='Specificity', corpus=corpus, session=session)
+    node = get_or_create_node(nodetype='Specificity', corpus=corpus, mysession=mysession)

     data = zip([node.id for i in range(1, m.shape[0])]
              , [corpus.id for i in range(1, m.shape[0])]
              , m.index.tolist()
              , m.values.tolist()
              )
-    session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id == node.id).delete()
-    session.commit()
+    mysession.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id == node.id).delete()
+    mysession.commit()

     bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
     return(node.id)
-    if sessionToRemove:
-        session.remove()

-def compute_specificity(corpus,limit=100, session=None):
+def compute_specificity(corpus,limit=100, mysession=None):
     '''
     Computing specificities as NodeNodeNgram.
     All workflow is the following:

@@ -69,19 +64,13 @@ def compute_specificity(corpus,limit=100, session=None):
     2) Compute the specificity score, saving it in database, return its Node
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - specificity' % corpus.id)

-    list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus, session=session)
-    cooc_id = do_cooc(corpus=corpus, cvalue_id=list_cvalue.id, limit=limit)
+    list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus, mysession=mysession)
+    cooc_id = do_cooc(corpus=corpus, cvalue_id=list_cvalue.id, limit=limit, mysession=mysession)

-    specificity(cooc_id=cooc_id, corpus=corpus, limit=limit)
+    specificity(cooc_id=cooc_id, corpus=corpus, limit=limit, mysession=mysession)
     dbg.show('specificity')
-    if sessionToRemove:
-        session.remove()

 #corpus=session.query(Node).filter(Node.id==244250).first()
 #compute_specificity(corpus)
ngram/stop.py  (view file @ 3476d4a2)

@@ -75,22 +75,18 @@ def isStopWord(ngram, stop_words=None):
         if test_match(word, regex) is True:
             return(True)

-def compute_stop(corpus,limit=2000,debug=False, session=None):
+def compute_stop(corpus,limit=2000,debug=False, mysession=None):
     '''
     do some statitics on all stop lists of database of the same type
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

-    stop_node_id = get_or_create_node(nodetype='StopList', corpus=corpus).id
+    stop_node_id = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=mysession).id

     # TODO do a function to get all stop words with social scores
-    root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
-    root_stop_id = get_or_create_node(nodetype='StopList', corpus=root).id
+    root = mysession.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
+    root_stop_id = get_or_create_node(nodetype='StopList', corpus=root, mysession=mysession).id

-    stop_words = (session.query(Ngram.terms)
+    stop_words = (mysession.query(Ngram.terms)
                  .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                  .filter(NodeNgram.node_id == root_stop_id)
                  .all()

@@ -99,7 +95,7 @@ def compute_stop(corpus,limit=2000,debug=False, session=None):
     #print([n for n in stop_words])

     frequency = sa.func.count(NodeNgram.weight)
-    ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
+    ngrams = (mysession.query(Ngram.id, Ngram.terms, frequency)
             .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
             .join(Node, Node.id == NodeNgram.node_id)
             .filter(Node.parent_id == corpus.id,

@@ -118,4 +114,3 @@ def compute_stop(corpus,limit=2000,debug=False, session=None):
     stop = WeightedList({n[0] : -1 for n in ngrams_to_stop})
     stop.save(stop_node_id)
-    if sessionToRemove:
-        session.remove()
ngram/tfidf.py  (view file @ 3476d4a2)

@@ -5,17 +5,12 @@ from gargantext_web.db import get_session, get_or_create_node
 from admin.utils import DebugTime

-def compute_tfidf(corpus, session=None):
+def compute_tfidf(corpus, mysession=None):
     # compute terms frequency sum
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - TFIDF' % corpus.id)
     dbg.show('calculate terms frequencies sums')

-    tfidf_node = get_or_create_node(nodetype='Tfidf', corpus=corpus, session=session)
+    tfidf_node = get_or_create_node(nodetype='Tfidf', corpus=corpus, mysession=mysession)

     db, cursor = get_cursor()
     cursor.execute('''

@@ -125,26 +120,20 @@ def compute_tfidf(corpus, session=None):
     # the end!
     db.commit()
-    if sessionToRemove:
-        session.remove()

-def compute_tfidf_global(corpus, session=None):
+def compute_tfidf_global(corpus, mysession=None):
     '''
     Maybe improve this with:
     #http://stackoverflow.com/questions/8674718/best-way-to-select-random-rows-postgresql
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - tfidf global' % corpus.id)
     dbg.show('calculate terms frequencies sums')

-    tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus, session=session)
+    tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus, mysession=mysession)

     # update would be better
-    session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id == tfidf_node.id).delete()
-    session.commit()
+    mysession.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id == tfidf_node.id).delete()
+    mysession.commit()

     # compute terms frequency sum
     db, cursor = get_cursor()

@@ -271,8 +260,3 @@ def compute_tfidf_global(corpus, session=None):
     db.commit()
     dbg.show('insert tfidf')
-    if sessionToRemove:
-        session.remove()

 #corpus=session.query(Node).filter(Node.id==244250).first()
 #compute_tfidf_global(corpus)
ngram/tools.py  (view file @ 3476d4a2)

@@ -8,8 +8,8 @@ def insert_ngrams_to_list(list_of_ngrams, corpus, list_type='MapList', erase=Tru
     '''
     session = get_session()
-    list_node = get_or_create_node(corpus=corpus, nodetype=list_type, session=session)
-    group_node = get_or_create_node(corpus=corpus, nodetype='GroupList', session=session)
+    list_node = get_or_create_node(corpus=corpus, nodetype=list_type, mysession=session)
+    group_node = get_or_create_node(corpus=corpus, nodetype='GroupList', mysession=session)
     group_list = (session.query(NodeNgramNgram.ngramy_id)
                  .filter(NodeNgramNgram.id == group_node.id)
                  .all()
ngram/workflow.py  (view file @ 3476d4a2)

@@ -12,17 +12,17 @@ from gargantext_web.db import Node , NodeNgram
 from admin.utils import WorkflowTracking

-def ngram_workflow(corpus, n=5000, session=None):
+def ngram_workflow(corpus, n=5000, mysession=None):
     '''
     All the workflow to filter the ngrams.
     '''
     update_state = WorkflowTracking()

-    update_state.processing_(corpus, "Stop words")
-    compute_stop(corpus, session=session)
+    update_state.processing_(corpus.id, "Stop words")
+    compute_stop(corpus, mysession=mysession)

-    update_state.processing_(corpus, "TF-IDF global score")
-    compute_tfidf_global(corpus, session=session)
+    update_state.processing_(corpus.id, "TF-IDF global score")
+    compute_tfidf_global(corpus, mysession=mysession)

     part = round(n * 0.9)

@@ -31,28 +31,28 @@ def ngram_workflow(corpus, n=5000, session=None):
     # part = round(part * 0.8)
     #print('spec part:', part)

-    update_state.processing_(corpus, "Specificity score")
-    compute_specificity(corpus, limit=part, session=session)
+    update_state.processing_(corpus.id, "Specificity score")
+    compute_specificity(corpus, limit=part, mysession=mysession)

     part = round(part * 0.8)

     limit_inf = round(part * 1)
     limit_sup = round(part * 5)
     #print(limit_inf,limit_sup)

-    update_state.processing_(corpus, "Synonyms")
+    update_state.processing_(corpus.id, "Synonyms")
     try:
-        compute_groups(corpus, limit_inf=limit_inf, limit_sup=limit_sup, session=session)
+        compute_groups(corpus, limit_inf=limit_inf, limit_sup=limit_sup, mysession=mysession)
     except Exception as error:
         print("Workflow Ngram Group error", error)
         pass

-    update_state.processing_(corpus, "Map list terms")
-    compute_mapList(corpus, limit=1000, session=session) # size
+    update_state.processing_(corpus.id, "Map list terms")
+    compute_mapList(corpus, limit=1000, mysession=mysession) # size

-    update_state.processing_(corpus, "TF-IDF local score")
-    compute_tfidf(corpus, session=session)
+    update_state.processing_(corpus.id, "TF-IDF local score")
+    compute_tfidf(corpus, mysession=mysession)

-    update_state.processing_(corpus, "Occurrences")
-    compute_occs(corpus, session=session)
+    update_state.processing_(corpus.id, "Occurrences")
+    compute_occs(corpus, mysession=mysession)
parsing/corpustools.py  (view file @ 3476d4a2)

@@ -31,12 +31,11 @@ parsers = Parsers()
 # resources management

-def add_resource(corpus, session=None, **kwargs):
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True
+def add_resource(corpus, mysession=None, **kwargs):
+    if mysession is None:
+        from gargantext_web.db import session
+        mysession = session
     # only for tests
     resource = Resource(guid=str(random()), **kwargs)

@@ -50,7 +49,7 @@ def add_resource(corpus, session=None, **kwargs):
         f.close()
     resource.digest = h.hexdigest()
     # check if a resource on this node already has this hash
-    tmp_resource = (session
+    tmp_resource = (mysession
        .query(Resource)
        .join(Node_Resource, Node_Resource.resource_id == Resource.id)
        .filter(Resource.digest == resource.digest)

@@ -59,28 +58,24 @@ def add_resource(corpus, session=None, **kwargs):
     if tmp_resource is not None:
         return tmp_resource
     else:
-        session.add(resource)
-        session.commit()
+        mysession.add(resource)
+        mysession.commit()
     # link with the resource
     node_resource = Node_Resource(
         node_id = corpus.id,
         resource_id = resource.id,
         parsed = False,
     )
-    session.add(node_resource)
-    session.commit()
-    # return result
+    mysession.add(node_resource)
+    mysession.commit()
     return resource
-    if sessionToRemove:
-        session.remove()

-def parse_resources(corpus, user=None, user_id=None, session=None):
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True
+def parse_resources(corpus, user=None, user_id=None, mysession=None):
+    if mysession is None:
+        from gargantext_web.db import session
+        mysession = session

     dbg = DebugTime('Corpus #%d - parsing' % corpus.id)

@@ -91,7 +86,7 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
     else:
         user_id = corpus.user_id
     # find resource of the corpus
-    resources_query = (session
+    resources_query = (mysession
         .query(Resource, ResourceType)
         .join(ResourceType, ResourceType.id == Resource.type_id)
         .join(Node_Resource, Node_Resource.resource_id == Resource.id)

@@ -134,14 +129,14 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
     # TODO: mark node-resources associations as parsed
     #
     dbg.show('insert %d documents' % len(nodes))
-    session.add_all(nodes)
-    session.commit()
+    mysession.add_all(nodes)
+    mysession.commit()
     # now, index the hyperdata
     dbg.show('insert hyperdata')
     node_hyperdata_lists = defaultdict(list)
     hyperdata_types = {
         hyperdata.name: hyperdata
-        for hyperdata in session.query(Hyperdata)
+        for hyperdata in mysession.query(Hyperdata)
     }
     #print('hyperdata_types', hyperdata_types)
     for node in nodes:

@@ -166,7 +161,7 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
             node_hyperdata_ngrams = set()
             #for field in ['source', 'authors', 'journal']:
             for field in ['journal', 'authors']:
-                hyperdata_set.add(session.query(Hyperdata.id).filter(Hyperdata.name == field).first()[0])
+                hyperdata_set.add(mysession.query(Hyperdata.id).filter(Hyperdata.name == field).first()[0])

     #print("hyperdata_set", hyperdata_set)

@@ -191,9 +186,6 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
     # mark the corpus as parsed
     corpus.parsed = True
-    if sessionToRemove:
-        session.remove()

 # ngrams extraction
 from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
 from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize

@@ -222,18 +214,17 @@ class NgramsExtractors(defaultdict):
 ngramsextractors = NgramsExtractors()

-def extract_ngrams(corpus, keys, nlp=True, session=None):
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True
+def extract_ngrams(corpus, keys, nlp=True, mysession=None):
+    if mysession is None:
+        from gargantext_web.db import session
+        mysession = session

     dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
     default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
     # query the hyperdata associated with the given keys
     columns = [Node.id, Node.language_id] + [Node.hyperdata[key] for key in keys]
-    hyperdata_query = (session
+    hyperdata_query = (mysession
         .query(*columns)
         .filter(Node.parent_id == corpus.id)
         .filter(Node.type_id == cache.NodeType['Document'].id)

@@ -242,7 +233,7 @@ def extract_ngrams(corpus, keys, nlp=True, session=None):
     dbg.show('find ngrams')
     languages_by_id = {
         language.id: language.iso2
-        for language in session.query(Language)
+        for language in mysession.query(Language)
     }

     ngrams_data = set()

@@ -321,9 +312,6 @@ def extract_ngrams(corpus, keys, nlp=True, session=None):
     # commit to database
     db.commit()
-    if sessionToRemove:
-        session.remove()

 def text_prepa(my_str):
     """