Commit 55136392, authored Jan 15, 2016 by delanoe
[FACTOR] One session of one workflow.
Parent: 29aa56bb
Showing 9 changed files with 116 additions and 59 deletions.
gargantext_web/celery.py    +17  -14
ngram/group.py               +6   -4
ngram/mapList.py            +14   -5
ngram/occurrences.py         +9   -2
ngram/specificity.py        +15   -7
ngram/stop.py                +6   -3
ngram/tfidf.py              +14   -7
ngram/workflow.py            +8   -8
parsing/corpustools.py      +27   -9
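All nine files apply one refactoring, matching the commit message: each workflow step now takes an optional session keyword, opens a scoped session only when the caller did not supply one, and removes only a session it opened itself, so a whole workflow can share a single session. A minimal sketch of the idiom as it recurs below (the function name do_something is hypothetical; get_session is the project's scoped-session factory):

from gargantext_web.db import get_session

def do_something(corpus, session=None):
    # Reuse the caller's session when one is given; otherwise open a
    # scoped session and remember that this function owns it.
    sessionToRemove = False
    if session is None:
        session = get_session()
        sessionToRemove = True

    # ... query and write through `session` here ...

    # Remove only a session this function itself created, so a
    # caller-supplied session survives for the rest of the workflow.
    if sessionToRemove:
        session.remove()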
gargantext_web/celery.py

@@ -35,25 +35,28 @@ def apply_workflow(corpus_id):
     update_state = WorkflowTracking()
-    session = get_session()
-    corpus = session.query(Node).filter(Node.id==corpus_id).first()
+    try:
+        session = get_session()
+        corpus = session.query(Node).filter(Node.id==corpus_id).first()

-    update_state.processing_(corpus, "Parsing")
-    #cProfile.runctx('parse_resources(corpus)', global,locals)
-    parse_resources(corpus)
+        update_state.processing_(corpus, "Parsing")
+        #cProfile.runctx('parse_resources(corpus)', global,locals)
+        parse_resources(corpus, session=session)

-    update_state.processing_(corpus, "Terms extraction")
-    extract_ngrams(corpus, ['title', 'abstract'], nlp=True)
+        update_state.processing_(corpus, "Terms extraction")
+        extract_ngrams(corpus, ['title', 'abstract'], nlp=True, session=session)

-    # update_state.processing_(corpus, "")
-    ngram_workflow(corpus)
+        # update_state.processing_(corpus, "")
+        ngram_workflow(corpus, session=session)

-    #ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
+        #ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)

-    print("End of the Workflow for corpus %d" % (corpus_id))
-    update_state.processing_(corpus, "0")
-    session.remove()
+        print("End of the Workflow for corpus %d" % (corpus_id))
+        update_state.processing_(corpus, "0")
+        session.remove()
+    except:
+        session.remove()

 @shared_task
 def empty_trash(corpus_id):
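apply_workflow() now owns the one session for the whole run and threads it through parsing, term extraction, and the ngram workflow. The added try/except exists only to guarantee session.remove() when a step fails, but the bare except also swallows the error. For comparison only (not what the commit does), a try/finally sketch gives the same cleanup guarantee while letting the exception propagate:

def apply_workflow(corpus_id):
    session = get_session()
    try:
        corpus = session.query(Node).filter(Node.id == corpus_id).first()
        # ... parse_resources, extract_ngrams, ngram_workflow,
        #     all sharing this one session ...
    finally:
        # Runs on success and on failure alike, so the scoped session
        # is always removed and any error still surfaces.
        session.remove()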
ngram/group.py

@@ -23,7 +23,6 @@ from math import log
 from functools import reduce
-
 def getStemmer(corpus):
     '''
     getStemmer :: Corpus -> Stemmer

@@ -48,11 +47,14 @@ def getStemmer(corpus):
     return(stemIt)

-def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
+def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=None):
     '''
     group ngrams according to a function (stemming or lemming)
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - group' % corpus.id)
     dbg.show('Group')

@@ -140,4 +142,4 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
     bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), [data for data in list(miam_to_insert)])
-    session.remove()
+    if sessionToRemove : session.remove()
ngram/mapList.py

@@ -19,8 +19,12 @@ def compute_mapList(corpus,limit=500,n=1):
     '''
     According to Specificities and stoplist,
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     monograms_part = 0.005
     monograms_limit = round(limit * monograms_part)
     multigrams_limit = limit - monograms_limit

@@ -87,10 +91,15 @@ def compute_mapList(corpus,limit=500,n=1):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

     dbg.show('MapList computed')
-    session.remove()
+    if sessionToRemove : session.remove()

 def insert_miam(corpus, ngrams=None, path_file_csv=None):
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
     node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)

@@ -124,6 +133,6 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
     file_csv.close()
     dbg.show('Miam computed')
-    session.remove()
+    if sessionToRemove : session.remove()
ngram/occurrences.py

@@ -5,7 +5,14 @@ from gargantext_web.db import get_or_create_node
 from admin.utils import DebugTime

 def compute_occs(corpus):
-    session = get_session()
+    '''
+    compute_occs :: Corpus -> IO ()
+    '''
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
     dbg.show('Calculate occurrences')

@@ -48,7 +55,7 @@ def compute_occs(corpus):
         )
     )
     db.commit()
-    session.remove()
+    if sessionToRemove : session.remove()

     #data = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==occs_node.id).all()
ngram/specificity.py

@@ -15,12 +15,15 @@ from gargantext_web.db import NodeNgramNgram, NodeNodeNgram
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select

-def specificity(cooc_id=None, corpus=None, limit=100):
+def specificity(cooc_id=None, corpus=None, limit=100, session=None):
     '''
     Compute the specificity, simple calculus.
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     cooccurrences = (session.query(NodeNgramNgram)
                      .filter(NodeNgramNgram.node_id==cooc_id)
                      .order_by(NodeNgramNgram.score)

@@ -55,17 +58,22 @@ def specificity(cooc_id=None, corpus=None, limit=100):
     bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
     return(node.id)
-    session.remove()
+    if sessionToRemove : session.remove()

-def compute_specificity(corpus, limit=100):
+def compute_specificity(corpus, limit=100, session=None):
     '''
     Computing specificities as NodeNodeNgram.
     All workflow is the following:
     1) Compute the cooc matrix
     2) Compute the specificity score, saving it in database, return its Node
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - specificity' % corpus.id)
     list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus, session=session)

@@ -73,7 +81,7 @@ def compute_specificity(corpus,limit=100):
     specificity(cooc_id=cooc_id, corpus=corpus, limit=limit)
     dbg.show('specificity')
-    session.remove()
+    if sessionToRemove : session.remove()

 #corpus=session.query(Node).filter(Node.id==244250).first()
 #compute_specificity(corpus)
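One caveat in this hunk: in specificity() the guarded cleanup replaces a line that was already unreachable, because it sits after return(node.id). For the removal of a locally opened session to ever run, it would have to precede the return:

    bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
    if sessionToRemove : session.remove()
    return(node.id)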
ngram/stop.py

@@ -75,11 +75,14 @@ def isStopWord(ngram, stop_words=None):
         if test_match(word, regex) is True:
             return(True)

-def compute_stop(corpus, limit=2000, debug=False):
+def compute_stop(corpus, limit=2000, debug=False, session=None):
     '''
     do some statitics on all stop lists of database of the same type
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     stop_node_id = get_or_create_node(nodetype='StopList', corpus=corpus).id

@@ -115,4 +118,4 @@ def compute_stop(corpus,limit=2000,debug=False):
     stop = WeightedList({n[0] : -1 for n in ngrams_to_stop})
     stop.save(stop_node_id)
-    session.remove()
+    if sessionToRemove : session.remove()
ngram/tfidf.py

@@ -5,9 +5,13 @@ from gargantext_web.db import get_session, get_or_create_node
 from admin.utils import DebugTime

-def compute_tfidf(corpus):
+def compute_tfidf(corpus, session=None):
     # compute terms frequency sum
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - TFIDF' % corpus.id)
     dbg.show('calculate terms frequencies sums')

@@ -121,15 +125,18 @@ def compute_tfidf(corpus):
     # the end!
     db.commit()
-    session.remove()
+    if sessionToRemove : session.remove()

-def compute_tfidf_global(corpus):
+def compute_tfidf_global(corpus, session=None):
     '''
     Maybe improve this with:
     #http://stackoverflow.com/questions/8674718/best-way-to-select-random-rows-postgresql
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - tfidf global' % corpus.id)
     dbg.show('calculate terms frequencies sums')

@@ -265,7 +272,7 @@ def compute_tfidf_global(corpus):
     db.commit()
     dbg.show('insert tfidf')
-    session.remove()
+    if sessionToRemove : session.remove()

 #corpus=session.query(Node).filter(Node.id==244250).first()
 #compute_tfidf_global(corpus)
ngram/workflow.py

@@ -12,17 +12,17 @@ from gargantext_web.db import Node , NodeNgram
 from admin.utils import WorkflowTracking

-def ngram_workflow(corpus, n=5000):
+def ngram_workflow(corpus, n=5000, session=None):
     '''
     All the workflow to filter the ngrams.
     '''
     update_state = WorkflowTracking()

     update_state.processing_(corpus, "Stop words")
-    compute_stop(corpus)
+    compute_stop(corpus, session=session)

     update_state.processing_(corpus, "TF-IDF global score")
-    compute_tfidf_global(corpus)
+    compute_tfidf_global(corpus, session=session)

     part = round(n * 0.9)

@@ -32,7 +32,7 @@ def ngram_workflow(corpus, n=5000):
     #print('spec part:', part)
     update_state.processing_(corpus, "Specificity score")
-    compute_specificity(corpus, limit=part)
+    compute_specificity(corpus, limit=part, session=session)

     part = round(part * 0.8)

@@ -41,18 +41,18 @@ def ngram_workflow(corpus, n=5000):
     #print(limit_inf,limit_sup)
     update_state.processing_(corpus, "Synonyms")
     try:
-        compute_groups(corpus, limit_inf=limit_inf, limit_sup=limit_sup)
+        compute_groups(corpus, limit_inf=limit_inf, limit_sup=limit_sup, session=session)
     except Exception as error:
         print("Workflow Ngram Group error", error)
         pass

     update_state.processing_(corpus, "Map list terms")
-    compute_mapList(corpus, limit=1000) # size
+    compute_mapList(corpus, limit=1000, session=session) # size

     update_state.processing_(corpus, "TF-IDF local score")
-    compute_tfidf(corpus)
+    compute_tfidf(corpus, session=session)

     update_state.processing_(corpus, "Occurrences")
-    compute_occs(corpus)
+    compute_occs(corpus, session=session)
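ngram_workflow() itself opens no session; it only forwards whatever it received to each sub-step, so a caller holding a session keeps the entire ngram pipeline on that one session, while a bare ngram_workflow(corpus) still lets every step fall back to get_session() internally. A usage sketch from the caller's side (corpus_id is a placeholder):

session = get_session()
try:
    corpus = session.query(Node).filter(Node.id == corpus_id).first()
    ngram_workflow(corpus, session=session)   # no step opens its own session
finally:
    session.remove()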
parsing/corpustools.py

@@ -31,8 +31,12 @@ parsers = Parsers()
 # resources management

-def add_resource(corpus, **kwargs):
-    session = get_session()
+def add_resource(corpus, session=None, **kwargs):
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     # only for tests
     resource = Resource(guid=str(random()), **kwargs)

@@ -67,10 +71,16 @@ def add_resource(corpus, **kwargs):
     session.commit()
     # return result
     return resource
-    session.remove()
+    if sessionToRemove:
+        session.remove()

-def parse_resources(corpus, user=None, user_id=None):
-    session = get_session()
+def parse_resources(corpus, user=None, user_id=None, session=None):
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - parsing' % corpus.id)

@@ -180,7 +190,9 @@ def parse_resources(corpus, user=None, user_id=None):
     # mark the corpus as parsed
     corpus.parsed = True
-    session.remove()
+    if sessionToRemove:
+        session.remove()

 # ngrams extraction
 from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor

@@ -210,8 +222,12 @@ class NgramsExtractors(defaultdict):
 ngramsextractors = NgramsExtractors()

-def extract_ngrams(corpus, keys, nlp=True):
-    session = get_session()
+def extract_ngrams(corpus, keys, nlp=True, session=None):
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
     default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2

@@ -304,7 +320,9 @@ def extract_ngrams(corpus, keys, nlp=True):
     dbg.message = 'insert %d associations' % len(node_ngram_data)
     # commit to database
     db.commit()
-    session.remove()
+    if sessionToRemove:
+        session.remove()

 def text_prepa(my_str):
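add_resource() shows the same dead cleanup as specificity(): the guarded removal sits after return resource and never executes. A sketch of the reachable ordering, keeping the commit's guard (note that resource may be detached once the session is removed):

    session.commit()
    if sessionToRemove:
        session.remove()
    # return result
    return resource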