humanities / gargantext - Commits

Commit 3476d4a2
Authored Jan 21, 2016 by delanoe
[FIX SESSIONS] local session for workflow and get_or_create_node are renamed to mysession.
Parent: c54058e2
Showing 16 changed files with 138 additions and 212 deletions.
admin/utils.py             +3   -4
analysis/cooccurrences.py  +14  -11
analysis/functions.py      +3   -3
analysis/periods.py        +2   -2
gargantext_web/celery.py   +13  -19
gargantext_web/db.py       +10  -13
ngram/cvalue.py            +3   -5
ngram/group.py             +9   -14
ngram/mapList.py           +15  -26
ngram/occurrences.py       +4   -9
ngram/specificity.py       +9   -20
ngram/stop.py              +6   -11
ngram/tfidf.py             +6   -22
ngram/tools.py             +2   -2
ngram/workflow.py          +15  -15
parsing/corpustools.py     +24  -36
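Every hunk below applies the same refactoring: helpers that used to open (and sometimes leak) their own scoped session now take an explicit `mysession` keyword argument, and fall back to the module-level session from `gargantext_web.db` only when the caller passes nothing. A minimal standalone sketch of that pattern, with a stand-in engine and a hypothetical `compute_something` (in the repository the fallback imports `session` from `gargantext_web.db` instead):

    from sqlalchemy import create_engine
    from sqlalchemy.orm import scoped_session, sessionmaker

    engine = create_engine('sqlite://')                    # stand-in engine for the sketch
    session = scoped_session(sessionmaker(bind=engine))    # module-level scoped session

    def compute_something(corpus_id, mysession=None):
        # Callers that own a session (e.g. a Celery task) pass it in;
        # otherwise fall back to the shared module-level scoped session.
        if mysession is None:
            mysession = session
        # ... all queries and commits go through mysession ...
        mysession.commit()
        # No mysession.remove() here: whoever created the session removes it.

Compared with the old `sessionToRemove` bookkeeping, session ownership now sits in exactly one place.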
admin/utils.py    View file @ 3476d4a2

@@ -47,13 +47,12 @@ def PrintException():
 class WorkflowTracking:
     def __init__(self):
         self.hola = "mundo"

-    def processing_(self, corpus, step):
+    def processing_(self, corpus_id, step):
         try:
-            the_query = """ UPDATE node_node SET hyperdata=\'{\"%s\":\"%s\"}\' WHERE id=%d """ % ("Processing", step, corpus.id)
+            the_query = """ UPDATE node_node SET hyperdata=\'{\"%s\":\"%s\"}\' WHERE id=%d """ % ("Processing", step, corpus_id)
             cursor = connection.cursor()
             try:
                 cursor.execute(the_query)

@@ -61,4 +60,4 @@ class WorkflowTracking:
             finally:
                 connection.close()
         except:
             PrintException()
\ No newline at end of file
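Note that `WorkflowTracking.processing_` now takes a plain `corpus_id` instead of a Node object; the callers below are updated to match (`int(corpus_id)` in gargantext_web/celery.py, `corpus.id` in ngram/workflow.py), so progress can be reported without passing a session-bound object around.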
analysis/cooccurrences.py    View file @ 3476d4a2

@@ -18,7 +18,8 @@ def do_cooc(corpus=None
             , start=None, end=None
             , limit=1000
             , isMonopartite=True
-            , hapax = 3):
+            , hapax = 3
+            , mysession=None):
     '''
     Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
     For the moment list of paramters are not supported because, lists need to

@@ -40,13 +41,16 @@ def do_cooc(corpus=None
     # Security test
     field1, field2 = str(field1), str(field2)

-    session = get_session()
+    if mysession is None:
+        from gargantext_web.db import session
+        mysession = session

     # Get node
     node_cooc = get_or_create_node(nodetype='Cooccurrence', corpus=corpus
                                   , name_str="Cooccurrences corpus " \
                                     + str(corpus.id) + "list_id: " + str(miam_id)
                                   #, hyperdata={'field1': field1, 'field2':field2}
-                                  , session=session)
+                                  , mysession=mysession)

     # BEGIN

@@ -60,12 +64,12 @@ def do_cooc(corpus=None
    #
    # node_cooc.hyperdata = hyperdata
    #
-    session.add(node_cooc)
-    session.commit()
+    mysession.add(node_cooc)
+    mysession.commit()
    # END

-    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==node_cooc.id).delete()
-    session.commit()
+    mysession.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==node_cooc.id).delete()
+    mysession.commit()

     doc_id = cache.NodeType['Document'].id

@@ -77,7 +81,7 @@ def do_cooc(corpus=None
     if isMonopartite:
         NodeNgramY = aliased(NodeNgram)

-        cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
+        cooc_query = (mysession.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
                       .join(Node, Node.id == NodeNgramX.node_id)
                       .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                       .filter(Node.parent_id==corpus.id, Node.type_id==doc_id)

@@ -85,7 +89,7 @@ def do_cooc(corpus=None
     else:
         NodeNgramY = aliased(NodeNgram)

-        cooc_query = (session.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
+        cooc_query = (mysession.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
                       .join(Node, Node.id == NodeHyperdataNgram.node_id)
                       .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                       .join(Hyperdata, Hyperdata.id == NodeHyperdataNgram.hyperdata_id)

@@ -169,7 +173,7 @@ def do_cooc(corpus=None
     # Select according some scores
     if cvalue_id is not None:
         #miam = get_or_create_node(nodetype='Cvalue', corpus=corpus)
-        cvalue_list = UnweightedList(session.query(NodeNodeNgram.ngram_id)
+        cvalue_list = UnweightedList(mysession.query(NodeNodeNgram.ngram_id)
                                     .filter(NodeNodeNgram.nodex_id == cvalue_id).all()
                                     )

@@ -200,4 +204,3 @@ def do_cooc(corpus=None
         cooc = matrix
     cooc.save(node_cooc.id)
     return(node_cooc.id)
-    session.remove()
analysis/functions.py    View file @ 3476d4a2

@@ -44,9 +44,9 @@ def get_cooc(request=None, corpus=None
     data = {}
     #if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
     print("Cooccurrences do not exist yet, creating it.")
-    miam_id = get_or_create_node(nodetype='MapList', corpus=corpus, session=session).id
-    stop_id = get_or_create_node(nodetype='StopList', corpus=corpus, session=session).id
-    group_id = get_or_create_node(nodetype='Group', corpus=corpus, session=session).id
+    miam_id = get_or_create_node(nodetype='MapList', corpus=corpus, mysession=session).id
+    stop_id = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=session).id
+    group_id = get_or_create_node(nodetype='Group', corpus=corpus, mysession=session).id
     SamuelFlag = False
     # if field1 == field2 == 'ngrams' :
analysis/periods.py    View file @ 3476d4a2

@@ -51,7 +51,7 @@ def periods(corpus, start=None, end=None):
     if duration.days > 365 * 3:
         print("OK")
-        miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus).id
+        miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=session).id
     result_list = list()
     for t in times:

@@ -86,7 +86,7 @@ def jacquard(period1, period2):
 def get_partition(corpus, start=None, end=None, distance=distance):
     session = get_session()
-    miam_id = get_or_create_node(corpus=corpus, nodetype='MapList', session=session).id
+    miam_id = get_or_create_node(corpus=corpus, nodetype='MapList', mysession=session).id
     print("get Partition %s - %s" % (start, end))
     cooc_id = do_cooc(corpus=corpus
                      , start=start
gargantext_web/celery.py    View file @ 3476d4a2

@@ -3,7 +3,7 @@
 from celery import shared_task
 from node import models
 from django.db import transaction
-from admin.utils import DebugTime
+from admin.utils import DebugTime, PrintException

 import cProfile

 #@app.task(bind=True)

@@ -15,15 +15,8 @@ from gargantext_web.db import get_session, cache, Node
 from ngram.workflow import ngram_workflow

-@shared_task
-def apply_sum(x, y):
-    print(x+y)
-    session = get_session()
-    print(session.query(Node.name).first())
-    session.remove()

 from parsing.corpustools import parse_resources, extract_ngrams #add_resource,
-from ngram.lists import ngrams2miam
+# from ngram.lists import ngrams2miam

 from admin.utils import WorkflowTracking

@@ -36,28 +29,29 @@ def apply_workflow(corpus_id):
     update_state = WorkflowTracking()

     try:
-        session = get_session()
-        corpus = session.query(Node).filter(Node.id==corpus_id).first()
+        mysession = get_session()
+        corpus = mysession.query(Node).filter(Node.id==corpus_id).first()

-        update_state.processing_(corpus, "Parsing")
+        update_state.processing_(int(corpus_id), "Parsing")
         #cProfile.runctx('parse_resources(corpus)', global,locals)
-        parse_resources(corpus, session=session)
+        parse_resources(corpus, mysession=mysession)

-        update_state.processing_(corpus, "Terms extraction")
-        extract_ngrams(corpus, ['title', 'abstract'], nlp=True, session=session)
+        update_state.processing_(int(corpus_id), "Terms extraction")
+        extract_ngrams(corpus, ['title', 'abstract'], nlp=True, mysession=mysession)

         # update_state.processing_(corpus, "")
-        ngram_workflow(corpus, session=session)
+        ngram_workflow(corpus, mysession=mysession)
         #ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)

         print("End of the Workflow for corpus %d" % (corpus_id))
-        update_state.processing_(corpus, "0")
-        session.remove()
+        update_state.processing_(int(corpus_id), "0")
+        mysession.remove()
     except Exception as error:
         print(error)
-        session.remove()
+        PrintException()
+        mysession.remove()

 @shared_task
 def empty_trash(corpus_id):
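The task is now the single owner of the session lifecycle: it creates `mysession`, threads it through every workflow step, and removes it on both the success and the error path. Condensed, the shape is the following sketch only, where `run_steps` is a hypothetical stand-in for the parse/extract/ngram calls above and `get_session` / `PrintException` are the repository's own helpers:

    def apply_workflow_sketch(corpus_id):
        mysession = get_session()          # task-local scoped session
        try:
            run_steps(corpus_id, mysession=mysession)
            mysession.remove()             # release on success
        except Exception as error:
            print(error)
            PrintException()
            mysession.remove()             # and on failure, so the scoped-session
                                           # registry stays clean between tasks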
gargantext_web/db.py    View file @ 3476d4a2

@@ -242,17 +242,16 @@ class bulk_insert:
     readline = read

-def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hyperdata=None,session=None):
+def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hyperdata=None,mysession=None):
     '''
     Should be a method of the object. __get_or_create__ ?
     name_str :: String
     hyperdata :: Dict
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True
+    if mysession is None:
+        from gargantext_web.db import session
+        mysession = session

     if nodetype is None:
         print("Need to give a type node")

@@ -262,13 +261,13 @@ def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hy
     except KeyError:
         ntype = cache.NodeType[nodetype] = NodeType()
         ntype.name = nodetype
-        session.add(ntype)
-        session.commit()
+        mysession.add(ntype)
+        mysession.commit()

     if corpus_id is not None and corpus is None:
-        corpus = session.query(Node).filter(Node.id==corpus_id).first()
+        corpus = mysession.query(Node).filter(Node.id==corpus_id).first()

-    node = (session.query(Node).filter(Node.type_id==ntype.id
+    node = (mysession.query(Node).filter(Node.type_id==ntype.id
                                       , Node.parent_id == corpus.id
                                       , Node.user_id == corpus.user_id
                                       )

@@ -289,11 +288,9 @@ def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hy
             node.name = name_str
         else:
             node.name = ntype.name
-        session.add(node)
-        session.commit()
+        mysession.add(node)
+        mysession.commit()
         #print(parent_id, n.parent_id, n.id, n.name)
     return(node)
-    if sessionToRemove:
-        session.remove()
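Two details worth noting in this hunk: the fallback now reuses the module-level `session` from `gargantext_web.db` instead of calling `get_session()`, so a bare call no longer spawns a session it must later remove; and the trailing `if sessionToRemove: session.remove()` after `return(node)` was unreachable dead code, which this change deletes outright.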
ngram/cvalue.py    View file @ 3476d4a2

@@ -67,7 +67,7 @@ def getNgrams(corpus=None, limit=1000):
     return(terms)
     session.remove()

-def compute_cvalue(corpus=None, limit=1000):
+def compute_cvalue(corpus=None, limit=1000, mysession=None):
     '''
     computeCvalue :: Corpus
     frequency :: String -> Int -> Int

@@ -126,13 +126,11 @@ def compute_cvalue(corpus=None, limit=1000):
     result = cvalueAll()
     #print([n for n in result])

-    session = get_session()
-    session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==cvalue_node.id).delete()
-    session.commit()
+    mysession.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==cvalue_node.id).delete()
+    mysession.commit()

     #bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [n for n in islice(result,0,100)])
     bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [n for n in result])
-    session.remove()

 # test
 #corpus=session.query(Node).filter(Node.id==244250).first()
 #computeCvalue(corpus)
ngram/group.py    View file @ 3476d4a2

@@ -47,14 +47,10 @@ def getStemmer(corpus):
     return(stemIt)

-def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=None):
+def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', mysession=None):
     '''
     group ngrams according to a function (stemming or lemming)
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - group' % corpus.id)
     dbg.show('Group')

@@ -66,19 +62,19 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
     stemIt = getStemmer(corpus)

     group_to_insert = set()
-    node_group = get_or_create_node(nodetype='Group', corpus=corpus, session=session)
+    node_group = get_or_create_node(nodetype='Group', corpus=corpus, mysession=mysession)

     miam_to_insert = set()
-    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus, session=session)
+    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=mysession)

-    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus, session=session)
+    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=mysession)
     #stop_list = UnweightedList(stop_node.id)
     Stop = aliased(NodeNgram)

     frequency = sa.func.count(NodeNgram.weight)
-    ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
+    ngrams = (mysession.query(Ngram.id, Ngram.terms, frequency)
              .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
              .join(Node, Node.id == NodeNgram.node_id)
              #.outerjoin(Stop, Stop.ngram_id == Ngram.id)

@@ -90,7 +86,7 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
              .limit(limit_sup)
              )

-    stops = (session.query(Ngram.id, Ngram.terms, frequency)
+    stops = (mysession.query(Ngram.id, Ngram.terms, frequency)
             .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
             .join(Node, Node.id == NodeNgram.node_id)
             .join(Stop, Stop.ngram_id == Ngram.id)

@@ -131,10 +127,10 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
             miam_to_insert.add((miam_node.id, group[key]['mainForm'], 1))

     # # Deleting previous groups
-    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==node_group.id).delete()
+    mysession.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==node_group.id).delete()
     # # Deleting previous ngrams miam list
-    session.query(NodeNgram).filter(NodeNgram.node_id==miam_node.id).delete()
-    session.commit()
+    mysession.query(NodeNgram).filter(NodeNgram.node_id==miam_node.id).delete()
+    mysession.commit()

     bulk_insert(NodeNgramNgram
                , ('node_id', 'ngramx_id', 'ngramy_id', 'score')

@@ -142,4 +138,3 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
     bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), [data for data in list(miam_to_insert)])
-    if sessionToRemove:
-        session.remove()
ngram/mapList.py    View file @ 3476d4a2

@@ -15,15 +15,11 @@ from sqlalchemy.orm import aliased
 from ngram.tools import insert_ngrams
 import csv

-def compute_mapList(corpus,limit=500,n=1, session=None):
+def compute_mapList(corpus,limit=500,n=1, mysession=None):
     '''
     According to Specificities and stoplist,
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     monograms_part = 0.005
     monograms_limit = round(limit * monograms_part)

@@ -31,11 +27,11 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

-    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
-    node_group = get_or_create_node(nodetype='Group', corpus=corpus)
-    node_spec = get_or_create_node(nodetype='Specificity', corpus=corpus)
+    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=mysession)
+    node_stop = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=mysession)
+    node_group = get_or_create_node(nodetype='Group', corpus=corpus, mysession=mysession)
+    node_spec = get_or_create_node(nodetype='Specificity', corpus=corpus, mysession=mysession)

     Miam = aliased(NodeNgram)
     Stop = aliased(NodeNgram)

@@ -43,7 +39,7 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
     Spec = aliased(NodeNodeNgram)

-    query = (session.query(Spec.ngram_id, Spec.score)
+    query = (mysession.query(Spec.ngram_id, Spec.score)
             .join(Miam, Spec.ngram_id == Miam.ngram_id)
             .join(Ngram, Ngram.id == Spec.ngram_id)
             #.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)

@@ -66,19 +62,19 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
             .limit(multigrams_limit)
             )

-    stop_ngrams = (session.query(NodeNgram.ngram_id)
+    stop_ngrams = (mysession.query(NodeNgram.ngram_id)
                   .filter(NodeNgram.node_id == node_stop.id)
                   .all()
                   )

-    grouped_ngrams = (session.query(NodeNgramNgram.ngramy_id)
+    grouped_ngrams = (mysession.query(NodeNgramNgram.ngramy_id)
                      .filter(NodeNgramNgram.node_id == node_group.id)
                      .all()
                      )

-    node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)
-    session.query(NodeNgram).filter(NodeNgram.node_id==node_mapList.id).delete()
-    session.commit()
+    node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus, mysession=mysession)
+    mysession.query(NodeNgram).filter(NodeNgram.node_id==node_mapList.id).delete()
+    mysession.commit()

     data = zip(
               [node_mapList.id for i in range(1, limit)]

@@ -91,20 +87,14 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

     dbg.show('MapList computed')
-    if sessionToRemove:
-        session.remove()

-def insert_miam(corpus, ngrams=None, path_file_csv=None):
+def insert_miam(corpus, ngrams=None, path_file_csv=None, mysession=None):
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

-    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    session.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
-    session.commit()
+    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=mysession)
+    mysession.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
+    mysession.commit()

     stop_words = set()
     miam_words = set()

@@ -133,6 +123,5 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
     file_csv.close()
     dbg.show('Miam computed')
-    if sessionToRemove:
-        session.remove()
ngram/occurrences.py    View file @ 3476d4a2

@@ -4,26 +4,22 @@ from gargantext_web.db import Node, NodeNgram, NodeNodeNgram
 from gargantext_web.db import get_or_create_node
 from admin.utils import DebugTime

-def compute_occs(corpus, session=None):
+def compute_occs(corpus, mysession=None):
     '''
     compute_occs :: Corpus -> IO ()
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
     dbg.show('Calculate occurrences')

-    occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus)
+    occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus, mysession=mysession)
     #print(occs_node.id)

-    (session.query(NodeNodeNgram)
+    (mysession.query(NodeNodeNgram)
             .filter(NodeNodeNgram.nodex_id==occs_node.id).delete()
     )
-    session.commit()
+    mysession.commit()

     db, cursor = get_cursor()
     cursor.execute('''

@@ -55,7 +51,6 @@ def compute_occs(corpus, session=None):
         )
     )
     db.commit()
-    if sessionToRemove:
-        session.remove()

     #data = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==occs_node.id).all()
ngram/specificity.py    View file @ 3476d4a2

@@ -15,16 +15,12 @@ from gargantext_web.db import NodeNgramNgram, NodeNodeNgram
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select

-def specificity(cooc_id=None, corpus=None, limit=100, session=None):
+def specificity(cooc_id=None, corpus=None, limit=100, mysession=None):
     '''
     Compute the specificity, simple calculus.
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

-    cooccurrences = (session.query(NodeNgramNgram)
+    cooccurrences = (mysession.query(NodeNgramNgram)
                     .filter(NodeNgramNgram.node_id==cooc_id)
                     .order_by(NodeNgramNgram.score)
                     .limit(limit)

@@ -45,23 +41,22 @@ def specificity(cooc_id=None, corpus=None, limit=100, session=None):
     m = (xs - ys) / (2 * (x.shape[0] - 1))
     m = m.sort(inplace=False)

-    node = get_or_create_node(nodetype='Specificity', corpus=corpus, session=session)
+    node = get_or_create_node(nodetype='Specificity', corpus=corpus, mysession=mysession)

     data = zip([node.id for i in range(1, m.shape[0])]
              , [corpus.id for i in range(1, m.shape[0])]
              , m.index.tolist()
              , m.values.tolist()
              )
-    session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==node.id).delete()
-    session.commit()
+    mysession.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==node.id).delete()
+    mysession.commit()

     bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
     return(node.id)
-    if sessionToRemove:
-        session.remove()

-def compute_specificity(corpus,limit=100, session=None):
+def compute_specificity(corpus,limit=100, mysession=None):
     '''
     Computing specificities as NodeNodeNgram.
     All workflow is the following:

@@ -69,19 +64,13 @@ def compute_specificity(corpus,limit=100, session=None):
     2) Compute the specificity score, saving it in database, return its Node
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - specificity' % corpus.id)

-    list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus, session=session)
+    list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus, mysession=mysession)

-    cooc_id = do_cooc(corpus=corpus, cvalue_id=list_cvalue.id, limit=limit)
-    specificity(cooc_id=cooc_id, corpus=corpus, limit=limit)
+    cooc_id = do_cooc(corpus=corpus, cvalue_id=list_cvalue.id, limit=limit, mysession=mysession)
+    specificity(cooc_id=cooc_id, corpus=corpus, limit=limit, mysession=mysession)
     dbg.show('specificity')
-    if sessionToRemove:
-        session.remove()

 #corpus=session.query(Node).filter(Node.id==244250).first()
 #compute_specificity(corpus)
ngram/stop.py    View file @ 3476d4a2

@@ -75,22 +75,18 @@ def isStopWord(ngram, stop_words=None):
         if test_match(word, regex) is True:
             return(True)

-def compute_stop(corpus,limit=2000,debug=False, session=None):
+def compute_stop(corpus,limit=2000,debug=False, mysession=None):
     '''
     do some statitics on all stop lists of database of the same type
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

-    stop_node_id = get_or_create_node(nodetype='StopList', corpus=corpus).id
+    stop_node_id = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=mysession).id

     # TODO do a function to get all stop words with social scores
-    root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
-    root_stop_id = get_or_create_node(nodetype='StopList', corpus=root).id
+    root = mysession.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
+    root_stop_id = get_or_create_node(nodetype='StopList', corpus=root, mysession=mysession).id

-    stop_words = (session.query(Ngram.terms)
+    stop_words = (mysession.query(Ngram.terms)
                  .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                  .filter(NodeNgram.node_id == root_stop_id)
                  .all()

@@ -99,7 +95,7 @@ def compute_stop(corpus,limit=2000,debug=False, session=None):
     #print([n for n in stop_words])

     frequency = sa.func.count(NodeNgram.weight)
-    ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
+    ngrams = (mysession.query(Ngram.id, Ngram.terms, frequency)
             .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
             .join(Node, Node.id == NodeNgram.node_id)
             .filter(Node.parent_id == corpus.id,

@@ -118,4 +114,3 @@ def compute_stop(corpus,limit=2000,debug=False, session=None):
     stop = WeightedList({n[0]: -1 for n in ngrams_to_stop})
     stop.save(stop_node_id)
-    if sessionToRemove:
-        session.remove()
ngram/tfidf.py    View file @ 3476d4a2

@@ -5,17 +5,12 @@ from gargantext_web.db import get_session, get_or_create_node
 from admin.utils import DebugTime

-def compute_tfidf(corpus, session=None):
+def compute_tfidf(corpus, mysession=None):
     # compute terms frequency sum
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - TFIDF' % corpus.id)
     dbg.show('calculate terms frequencies sums')

-    tfidf_node = get_or_create_node(nodetype='Tfidf', corpus=corpus, session=session)
+    tfidf_node = get_or_create_node(nodetype='Tfidf', corpus=corpus, mysession=mysession)

     db, cursor = get_cursor()
     cursor.execute('''

@@ -125,26 +120,20 @@ def compute_tfidf(corpus, session=None):
     # the end!
     db.commit()
-    if sessionToRemove:
-        session.remove()

-def compute_tfidf_global(corpus, session=None):
+def compute_tfidf_global(corpus, mysession=None):
     '''
     Maybe improve this with:
     #http://stackoverflow.com/questions/8674718/best-way-to-select-random-rows-postgresql
     '''
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - tfidf global' % corpus.id)
     dbg.show('calculate terms frequencies sums')

-    tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus, session=session)
+    tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus, mysession=mysession)
     # update would be better
-    session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==tfidf_node.id).delete()
-    session.commit()
+    mysession.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==tfidf_node.id).delete()
+    mysession.commit()

     # compute terms frequency sum
     db, cursor = get_cursor()

@@ -271,8 +260,3 @@ def compute_tfidf_global(corpus, session=None):
     db.commit()
     dbg.show('insert tfidf')
-    if sessionToRemove:
-        session.remove()
-
-#corpus=session.query(Node).filter(Node.id==244250).first()
-#compute_tfidf_global(corpus)
ngram/tools.py    View file @ 3476d4a2

@@ -8,8 +8,8 @@ def insert_ngrams_to_list(list_of_ngrams, corpus, list_type='MapList', erase=Tru
     '''
     session = get_session()
-    list_node = get_or_create_node(corpus=corpus, nodetype=list_type, session=session)
-    group_node = get_or_create_node(corpus=corpus, nodetype='GroupList', session=session)
+    list_node = get_or_create_node(corpus=corpus, nodetype=list_type, mysession=session)
+    group_node = get_or_create_node(corpus=corpus, nodetype='GroupList', mysession=session)
     group_list = (session.query(NodeNgramNgram.ngramy_id)
                  .filter(NodeNgramNgram.id == group_node.id)
                  .all()
ngram/workflow.py    View file @ 3476d4a2

@@ -12,17 +12,17 @@ from gargantext_web.db import Node , NodeNgram
 from admin.utils import WorkflowTracking

-def ngram_workflow(corpus, n=5000, session=None):
+def ngram_workflow(corpus, n=5000, mysession=None):
     '''
     All the workflow to filter the ngrams.
     '''
     update_state = WorkflowTracking()

-    update_state.processing_(corpus, "Stop words")
-    compute_stop(corpus, session=session)
+    update_state.processing_(corpus.id, "Stop words")
+    compute_stop(corpus, mysession=mysession)

-    update_state.processing_(corpus, "TF-IDF global score")
-    compute_tfidf_global(corpus, session=session)
+    update_state.processing_(corpus.id, "TF-IDF global score")
+    compute_tfidf_global(corpus, mysession=mysession)

     part = round(n * 0.9)

@@ -31,28 +31,28 @@ def ngram_workflow(corpus, n=5000, session=None):
     # part = round(part * 0.8)
     #print('spec part:', part)
-    update_state.processing_(corpus, "Specificity score")
-    compute_specificity(corpus, limit=part, session=session)
+    update_state.processing_(corpus.id, "Specificity score")
+    compute_specificity(corpus, limit=part, mysession=mysession)

     part = round(part * 0.8)

     limit_inf = round(part * 1)
     limit_sup = round(part * 5)
     #print(limit_inf,limit_sup)
-    update_state.processing_(corpus, "Synonyms")
+    update_state.processing_(corpus.id, "Synonyms")
     try:
-        compute_groups(corpus, limit_inf=limit_inf, limit_sup=limit_sup, session=session)
+        compute_groups(corpus, limit_inf=limit_inf, limit_sup=limit_sup, mysession=mysession)
     except Exception as error:
         print("Workflow Ngram Group error", error)
         pass

-    update_state.processing_(corpus, "Map list terms")
-    compute_mapList(corpus, limit=1000, session=session) # size
+    update_state.processing_(corpus.id, "Map list terms")
+    compute_mapList(corpus, limit=1000, mysession=mysession) # size

-    update_state.processing_(corpus, "TF-IDF local score")
-    compute_tfidf(corpus, session=session)
+    update_state.processing_(corpus.id, "TF-IDF local score")
+    compute_tfidf(corpus, mysession=mysession)

-    update_state.processing_(corpus, "Occurrences")
-    compute_occs(corpus, session=session)
+    update_state.processing_(corpus.id, "Occurrences")
+    compute_occs(corpus, mysession=mysession)
parsing/corpustools.py    View file @ 3476d4a2

@@ -31,12 +31,11 @@ parsers = Parsers()

 # resources management
-def add_resource(corpus, session=None, **kwargs):
+def add_resource(corpus, mysession=None, **kwargs):
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True
+    if mysession is None:
+        from gargantext_web.db import session
+        mysession = session

     # only for tests
     resource = Resource(guid=str(random()), **kwargs)

@@ -50,7 +49,7 @@ def add_resource(corpus, session=None, **kwargs):
     f.close()
     resource.digest = h.hexdigest()
     # check if a resource on this node already has this hash
-    tmp_resource = (session
+    tmp_resource = (mysession
         .query(Resource)
         .join(Node_Resource, Node_Resource.resource_id == Resource.id)
         .filter(Resource.digest == resource.digest)

@@ -59,28 +58,24 @@ def add_resource(corpus, session=None, **kwargs):
     if tmp_resource is not None:
         return tmp_resource
     else:
-        session.add(resource)
-        session.commit()
+        mysession.add(resource)
+        mysession.commit()
     # link with the resource
     node_resource = Node_Resource(
         node_id = corpus.id,
         resource_id = resource.id,
         parsed = False,
     )
-    session.add(node_resource)
-    session.commit()
+    mysession.add(node_resource)
+    mysession.commit()
+    # return result
     return resource
-    if sessionToRemove:
-        session.remove()

-def parse_resources(corpus, user=None, user_id=None, session=None):
+def parse_resources(corpus, user=None, user_id=None, mysession=None):
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True
+    if mysession is None:
+        from gargantext_web.db import session
+        mysession = session

     dbg = DebugTime('Corpus #%d - parsing' % corpus.id)

@@ -91,7 +86,7 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
     else:
         user_id = corpus.user_id
     # find resource of the corpus
-    resources_query = (session
+    resources_query = (mysession
         .query(Resource, ResourceType)
         .join(ResourceType, ResourceType.id == Resource.type_id)
         .join(Node_Resource, Node_Resource.resource_id == Resource.id)

@@ -134,14 +129,14 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
     # TODO: mark node-resources associations as parsed
     #
     dbg.show('insert %d documents' % len(nodes))
-    session.add_all(nodes)
-    session.commit()
+    mysession.add_all(nodes)
+    mysession.commit()
     # now, index the hyperdata
     dbg.show('insert hyperdata')
     node_hyperdata_lists = defaultdict(list)
     hyperdata_types = {
         hyperdata.name: hyperdata
-        for hyperdata in session.query(Hyperdata)
+        for hyperdata in mysession.query(Hyperdata)
     }
     #print('hyperdata_types', hyperdata_types)
     for node in nodes:

@@ -166,7 +161,7 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
     node_hyperdata_ngrams = set()
     #for field in ['source', 'authors', 'journal']:
     for field in ['journal', 'authors']:
-        hyperdata_set.add(session.query(Hyperdata.id).filter(Hyperdata.name==field).first()[0])
+        hyperdata_set.add(mysession.query(Hyperdata.id).filter(Hyperdata.name==field).first()[0])
     #print("hyperdata_set", hyperdata_set)

@@ -191,9 +186,6 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
     # mark the corpus as parsed
     corpus.parsed = True
-    if sessionToRemove:
-        session.remove()

 # ngrams extraction
 from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
 from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize

@@ -222,18 +214,17 @@ class NgramsExtractors(defaultdict):
 ngramsextractors = NgramsExtractors()

-def extract_ngrams(corpus, keys, nlp=True, session=None):
+def extract_ngrams(corpus, keys, nlp=True, mysession=None):
-    sessionToRemove = False
-    if session is None:
-        session = get_session()
-        sessionToRemove = True
+    if mysession is None:
+        from gargantext_web.db import session
+        mysession = session

     dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
     default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
     # query the hyperdata associated with the given keys
     columns = [Node.id, Node.language_id] + [Node.hyperdata[key] for key in keys]
-    hyperdata_query = (session
+    hyperdata_query = (mysession
         .query(*columns)
         .filter(Node.parent_id == corpus.id)
         .filter(Node.type_id == cache.NodeType['Document'].id)

@@ -242,7 +233,7 @@ def extract_ngrams(corpus, keys, nlp=True, session=None):
     dbg.show('find ngrams')
     languages_by_id = {
         language.id: language.iso2
-        for language in session.query(Language)
+        for language in mysession.query(Language)
     }
     ngrams_data = set()

@@ -321,9 +312,6 @@ def extract_ngrams(corpus, keys, nlp=True, session=None):
     # commit to database
     db.commit()
-    if sessionToRemove:
-        session.remove()

 def text_prepa(my_str):
     """