Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
58aa990d
Commit
58aa990d
authored
Mar 14, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
maplist generation and better estimates for constants (thresholds)
parent
744ec7f1
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
419 additions
and
304 deletions
+419
-304
ngram_parsing_flow.png
doc/ngram_parsing_flow.png
+0
-0
constants.py
gargantext/constants.py
+17
-4
__init__.py
gargantext/util/toolchain/__init__.py
+27
-21
list_main.py
gargantext/util/toolchain/list_main.py
+35
-17
list_map.py
gargantext/util/toolchain/list_map.py
+99
-94
list_stop.py
gargantext/util/toolchain/list_stop.py
+31
-38
ngram_coocs_tempo.py
gargantext/util/toolchain/ngram_coocs_tempo.py
+66
-38
ngram_groups.py
gargantext/util/toolchain/ngram_groups.py
+29
-18
ngram_scores.py
gargantext/util/toolchain/ngram_scores.py
+59
-31
score_specificity.py
gargantext/util/toolchain/score_specificity.py
+39
-37
projects.py
gargantext/views/pages/projects.py
+17
-6
No files found.
doc/ngram_parsing_flow.png
0 → 100644
View file @
58aa990d
52.5 KB
gargantext/constants.py
View file @
58aa990d
...
@@ -9,9 +9,11 @@ LISTTYPES = {
...
@@ -9,9 +9,11 @@ LISTTYPES = {
'STOPLIST'
:
UnweightedList
,
'STOPLIST'
:
UnweightedList
,
'MAINLIST'
:
UnweightedList
,
'MAINLIST'
:
UnweightedList
,
'MAPLIST'
:
UnweightedList
,
'MAPLIST'
:
UnweightedList
,
'SPECIFICITY'
:
WeightedList
,
'OCCURRENCES'
:
WeightedContextIndex
,
'OCCURRENCES'
:
WeightedContextIndex
,
'COOCCURRENCES'
:
WeightedMatrix
,
'COOCCURRENCES'
:
WeightedMatrix
,
'TFIDF-CORPUS'
:
WeightedContextIndex
,
'TFIDF-CORPUS'
:
WeightedContextIndex
,
'TFIDF-GLOBAL'
:
WeightedContextIndex
,
}
}
NODETYPES
=
[
NODETYPES
=
[
...
@@ -92,10 +94,21 @@ RESOURCETYPES = [
...
@@ -92,10 +94,21 @@ RESOURCETYPES = [
# },
# },
]
]
# linguistic extraction parameters
# linguistic extraction parameters ---------------------------------------------
DEFAULT_TFIDF_CUTOFF_RATIO
=
.55
# for MAINLIST maximum terms
DEFAULT_TFIDF_CUTOFF_RATIO
=
.45
# MAINLIST maximum terms in %
DEFAULT_TFIDF_HARD_LIMIT
=
1000
# for MAINLIST maximum terms
DEFAULT_COOC_THRESHOLD
=
4
# for COOCCURRENCES node
DEFAULT_TFIDF_HARD_LIMIT
=
750
# MAINLIST maximum terms abs
# (makes COOCS larger ~ O(N²) /!\)
DEFAULT_COOC_THRESHOLD
=
5
# inclusive minimum for COOCS coefs
# (makes COOCS more sparse)
DEFAULT_MAPLIST_MAX
=
300
# MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO
=
.5
# part of monograms in MAPLIST
# (NB: used to be 0.005 !!)
# ------------------------------------------------------------------------------
# other parameters
# other parameters
# default number of docs POSTed to scrappers.views.py
# default number of docs POSTed to scrappers.views.py
...
...
gargantext/util/toolchain/__init__.py
View file @
58aa990d
from
.parsing
import
parse
from
.parsing
import
parse
from
.ngrams_extraction
import
extract_ngrams
from
.ngrams_extraction
import
extract_ngrams
# in usual run order
from
.list_stop
import
do_stoplist
from
.list_stop
import
do_stoplist
from
.ngram_scores
import
compute_occurrences_local
,
compute_tfidf
from
.ngram_scores
import
compute_occurrences_local
,
compute_tfidf
from
.list_main
import
do_mainlist
from
.list_main
import
do_mainlist
from
.ngram_coocs_tempo
import
compute_coocs
from
.ngram_coocs_tempo
import
compute_coocs
from
.score_specificity
import
compute_specificity
from
.score_specificity
import
compute_specificity
from
.list_map
import
compute_mapL
ist
# TEST
from
.list_map
import
do_mapl
ist
# TEST
from
.ngram_groups
import
compute_groups
from
.ngram_groups
import
compute_groups
from
gargantext.util.db
import
session
from
gargantext.util.db
import
session
...
@@ -40,10 +41,19 @@ def parse_extract(corpus):
...
@@ -40,10 +41,19 @@ def parse_extract(corpus):
# -------------------------------
# -------------------------------
print
(
'CORPUS #
%
d: [
%
s] starting ngram lists computation'
%
(
corpus
.
id
,
t
()))
print
(
'CORPUS #
%
d: [
%
s] starting ngram lists computation'
%
(
corpus
.
id
,
t
()))
# -> stoplist:
compute + write (=>
Node and NodeNgram)
# -> stoplist:
filter + write (to
Node and NodeNgram)
stop_id
=
compute_stop
(
corpus
)
stop_id
=
do_stoplist
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new stoplist node #
%
i'
%
(
corpus
.
id
,
t
(),
stop_id
))
print
(
'CORPUS #
%
d: [
%
s] new stoplist node #
%
i'
%
(
corpus
.
id
,
t
(),
stop_id
))
# -> write groups to Node and NodeNgramNgram
group_id
=
compute_groups
(
corpus
,
stoplist_id
=
None
)
print
(
'CORPUS #
%
d: [
%
s] new grouplist node #
%
i'
%
(
corpus
.
id
,
t
(),
group_id
))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id
=
compute_occurrences_local
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new occs node #
%
i'
%
(
corpus
.
id
,
t
(),
occ_id
))
# ------------
# -> write local tfidf to Node and NodeNodeNgram
# -> write local tfidf to Node and NodeNodeNgram
ltfidf_id
=
compute_tfidf
(
corpus
,
scope
=
"local"
)
ltfidf_id
=
compute_tfidf
(
corpus
,
scope
=
"local"
)
print
(
'CORPUS #
%
d: [
%
s] new localtfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
ltfidf_id
))
print
(
'CORPUS #
%
d: [
%
s] new localtfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
ltfidf_id
))
...
@@ -52,31 +62,27 @@ def parse_extract(corpus):
...
@@ -52,31 +62,27 @@ def parse_extract(corpus):
gtfidf_id
=
compute_tfidf
(
corpus
,
scope
=
"global"
)
gtfidf_id
=
compute_tfidf
(
corpus
,
scope
=
"global"
)
print
(
'CORPUS #
%
d: [
%
s] new globaltfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
gtfidf_id
))
print
(
'CORPUS #
%
d: [
%
s] new globaltfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
gtfidf_id
))
# -> mainlist: compute + write (to Node and NodeNgram)
# -> mainlist: filter + write (to Node and NodeNgram)
mainlist_id
=
mainlist_filter
(
corpus
,
tfidf_id
=
gtfidf_id
,
stoplist_id
=
stop_id
)
mainlist_id
=
do_mainlist
(
corpus
,
tfidf_id
=
gtfidf_id
,
stoplist_id
=
stop_id
)
print
(
'CORPUS #
%
d: [
%
s] new mainlist node #
%
i'
%
(
corpus
.
id
,
t
(),
mainlist_id
))
print
(
'CORPUS #
%
d: [
%
s] new mainlist node #
%
i'
%
(
corpus
.
id
,
t
(),
mainlist_id
))
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id
=
compute_coocs
(
corpus
,
mainlist_id
=
mainlist_id
,
stop_id
=
None
)
cooc_id
=
compute_coocs
(
corpus
,
mainlist_id
=
mainlist_id
)
print
(
'CORPUS #
%
d: [
%
s] new cooc
c
s node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
print
(
'CORPUS #
%
d: [
%
s] new coocs node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
#
??
specificity: compute + write (=> NodeNodeNgram)
#
->
specificity: compute + write (=> NodeNodeNgram)
spec_id
=
compute_specificity
(
corpus
,
cooc_id
=
cooc_id
)
spec_id
=
compute_specificity
(
corpus
,
cooc_id
=
cooc_id
)
print
(
'CORPUS #
%
d: [
%
s] new specificity node #
%
i'
%
(
corpus
.
id
,
t
(),
coo
c_id
))
print
(
'CORPUS #
%
d: [
%
s] new specificity node #
%
i'
%
(
corpus
.
id
,
t
(),
spe
c_id
))
# ?? maplist: compute + write (to Node and NodeNgram)
# ?? maplist: compute + write (to Node and NodeNgram)
# map_id = compute_stop(corpus)
map_id
=
do_maplist
(
corpus
,
# print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
mainlist_id
=
mainlist_id
,
specificity_id
=
spec_id
,
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
grouplist_id
=
group_id
)
occ_id
=
compute_occurrences_local
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new maplist node #
%
i'
%
(
corpus
.
id
,
t
(),
map_id
))
print
(
'CORPUS #
%
d: [
%
s] new occs node #
%
i'
%
(
corpus
.
id
,
t
(),
occ_id
))
# -> write groups to Node and NodeNgramNgram
group_id
=
compute_groups
(
corpus
,
stoplist_id
=
None
)
print
(
'CORPUS #
%
d: [
%
s] new grouplist node #
%
i'
%
(
corpus
.
id
,
t
(),
group_id
))
def
t
():
def
t
():
...
...
gargantext/util/toolchain/list_main.py
View file @
58aa990d
...
@@ -2,26 +2,38 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
...
@@ -2,26 +2,38 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
from
gargantext.util.db
import
session
from
gargantext.util.db
import
session
from
gargantext.util.lists
import
UnweightedList
from
gargantext.util.lists
import
UnweightedList
from
sqlalchemy
import
desc
from
sqlalchemy
import
desc
from
gargantext.constants
import
DEFAULT_TFIDF_CUTOFF_RATIO
,
DEFAULT_TFIDF_HARD_LIMIT
from
gargantext.constants
import
DEFAULT_TFIDF_CUTOFF_RATIO
,
\
from
math
import
floor
DEFAULT_TFIDF_HARD_LIMIT
def
do_mainlist
(
corpus
,
tfidf_id
=
None
,
stoplist_id
=
None
,
def
do_mainlist
(
corpus
,
overwrite_id
=
None
,
tfidf_id
=
None
,
stoplist_id
=
None
,
hard_limit
=
DEFAULT_TFIDF_HARD_LIMIT
,
hard_limit
=
DEFAULT_TFIDF_HARD_LIMIT
,
ratio_limit
=
DEFAULT_TFIDF_CUTOFF_RATIO
ratio_limit
=
DEFAULT_TFIDF_CUTOFF_RATIO
):
):
"""
"""
Select t
erms for the mainlist according to a global tfidf and stoplist
.
Select t
op n terms according to a global tfidf ranking and stoplist filter
.
The number of selected terms will be:
The number of selected terms will be:
min(hard_limit, number_of_terms * ratio_limit)
min(hard_limit, number_of_terms * ratio_limit)
NB : We use a global tfidf node where the values are global but the ngrams
NB : We use a global tfidf node where the values are global but the ngrams
are already selected (== only within this corpus documents).
are already selected (== only within this corpus documents).
TO DISCUSS: allow influence of the local tfidf scores too
Parameters:
Parameters:
2 limits are useful to set a maximum amount of picked terms
- the corpus itself
- ratio_limit: relative to the number of distinct ngrams [0,1]
- a tfidf score for ranking the ngrams
- hard_limit: absolute value [default: 1000]
- a stoplist for filtering some ngrams
- overwrite_id: optional id of a pre-existing MAINLIST node for this corpus
(the Node and its previous NodeNgram rows will be replaced)
+ 2 limits to set the amount of picked terms:
- ratio_limit ∈ [0,1]: a ratio relative to the number of distinct ngrams
(default: 0.45)
- hard_limit: an absolute max value
(default: 750)
"""
"""
# retrieve helper nodes if not provided
# retrieve helper nodes if not provided
...
@@ -61,20 +73,26 @@ def do_mainlist(corpus, tfidf_id=None, stoplist_id=None,
...
@@ -61,20 +73,26 @@ def do_mainlist(corpus, tfidf_id=None, stoplist_id=None,
nb_ngrams
=
ordered_filtered_tfidf
.
count
()
nb_ngrams
=
ordered_filtered_tfidf
.
count
()
# apply ratio to find smallest limit
# apply ratio to find smallest limit
our_limit
=
min
(
hard_limit
,
floor
(
nb_ngrams
*
ratio_limit
))
our_limit
=
min
(
hard_limit
,
round
(
nb_ngrams
*
ratio_limit
))
print
(
"MAINLIST: keeping
%
i ngrams out of
%
i"
%
(
our_limit
,
nb_ngrams
))
# DB retrieve up to limit => MAINLIST
# DB retrieve up to limit => MAINLIST
top_ngrams_ids
=
ordered_filtered_tfidf
.
limit
(
our_limit
)
.
all
()
top_ngrams_ids
=
ordered_filtered_tfidf
.
limit
(
our_limit
)
.
all
()
# now create the new MAINLIST node
if
overwrite_id
:
mainlist
=
corpus
.
add_child
(
# overwrite pre-existing id
typename
=
"MAINLIST"
,
the_id
=
overwrite_id
name
=
"Mainlist (in:
%
s)"
%
corpus
.
name
[
0
:
10
]
# mainlist = cache.Node[overwrite_id]
)
else
:
session
.
add
(
mainlist
)
# now create the new MAINLIST node
session
.
commit
()
mainlist
=
corpus
.
add_child
(
typename
=
"MAINLIST"
,
the_id
=
mainlist
.
id
name
=
"Mainlist (in:
%
s)"
%
corpus
.
id
)
session
.
add
(
mainlist
)
session
.
commit
()
the_id
=
mainlist
.
id
# create UnweightedList object and save (=> new NodeNgram rows)
# create UnweightedList object and save (=> new NodeNgram rows)
UnweightedList
(
top_ngrams_ids
)
.
save
(
the_id
)
UnweightedList
(
top_ngrams_ids
)
.
save
(
the_id
)
...
...
gargantext/util/toolchain/list_map.py
View file @
58aa990d
from
gargantext.util.db
import
*
"""
from
gargantext.util.db_cache
import
*
Selects a subset of corpus ngrams to use in the graph map.
from
gargantext.constants
import
*
"""
from
gargantext.models.ngrams
import
Ngram
,
NodeNgram
,
\
from
gargantext.models.ngrams
import
Node
,
Ngram
,
NodeNgram
,
\
NodeNodeNgram
,
NodeNgramNgram
NodeNgramNgram
,
NodeNodeNgram
from
gargantext.util.db
import
session
,
aliased
,
func
from
gargantext.util.db_cache
import
cache
from
gargantext.util.lists
import
UnweightedList
from
sqlalchemy
import
desc
from
gargantext.constants
import
DEFAULT_MAPLIST_MAX
,
\
DEFAULT_MAPLIST_MONOGRAMS_RATIO
def
do_maplist
(
corpus
,
overwrite_id
=
None
,
mainlist_id
=
None
,
specificity_id
=
None
,
grouplist_id
=
None
,
limit
=
DEFAULT_MAPLIST_MAX
,
monograms_part
=
DEFAULT_MAPLIST_MONOGRAMS_RATIO
):
'''
According to Specificities and mainlist
from
sqlalchemy.sql
import
func
Parameters:
from
sqlalchemy
import
desc
,
asc
,
or_
,
and_
,
Date
,
cast
,
select
- mainlist_id (starting point, already cleaned of stoplist terms)
from
sqlalchemy
import
literal_column
- specificity_id (ranking factor)
from
sqlalchemy.orm
import
aliased
- grouplist_id (filtering grouped ones)
- overwrite_id: optional id of a preexisting MAPLIST node to overwrite
from
gargantext.util.toolchain.ngram_tools
import
insert_ngrams
import
csv
def
compute_mapList
(
corpus_id
,
limit
=
500
,
n
=
1
,
session
=
None
):
+ 2 constants to modulate the terms choice
'''
- limit for the amount of picked terms
According to Specificities and stoplist,
- monograms_part: a ratio of terms with only one lexical unit to keep
'''
'''
if
not
(
mainlist_id
and
specificity_id
and
grouplist_id
):
monograms_part
=
0.005
raise
ValueError
(
"Please provide mainlist_id, specificity_id and grouplist_id"
)
monograms_limit
=
round
(
limit
*
monograms_part
)
monograms_limit
=
round
(
limit
*
monograms_part
)
multigrams_limit
=
limit
-
monograms_limit
multigrams_limit
=
limit
-
monograms_limit
print
(
"MAPLIST: monograms_limit ="
,
monograms_limit
)
print
(
"MAPLIST: multigrams_limit = "
,
multigrams_limit
)
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
list_main_id
=
session
.
query
(
Node
.
id
)
.
filter
(
mainterms_subquery
=
(
session
Node
.
typename
==
"MAINLIST"
,
# we want only terms within mainlist
Node
.
parent_id
==
corpus_id
)
.
first
()
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
==
mainlist_id
)
list_stop_id
=
session
.
query
(
Node
.
id
)
.
filter
(
.
subquery
()
Node
.
typename
==
"STOPLIST"
,
)
Node
.
parent_id
==
corpus_id
)
.
first
()
primary_groupterms_subquery
=
(
session
list_group_id
=
session
.
query
(
Node
.
id
)
.
filter
(
# we want only primary terms (ngram1)
Node
.
typename
==
"GROUPLIST"
,
.
query
(
NodeNgramNgram
.
ngram1_id
)
Node
.
parent_id
==
corpus_id
)
.
first
()
.
filter
(
NodeNgramNgram
.
node_id
==
grouplist_id
)
.
subquery
()
score_spec_id
=
session
.
query
(
Node
.
id
)
.
filter
(
)
Node
.
typename
==
"SPECIFICITY"
,
Node
.
parent_id
==
corpus_id
)
.
first
()
ScoreSpec
=
aliased
(
NodeNgram
)
# specificity-ranked
ListMain
=
aliased
(
NodeNgram
)
query
=
(
session
.
query
(
ScoreSpec
.
ngram_id
)
ListStop
=
aliased
(
NodeNgram
)
ListGroup
=
aliased
(
NodeNgramNgram
)
ScoreSpec
=
aliased
(
NodeNodeNgram
)
# FIXME outerjoin does not work with current SqlAlchemy
# lines below the query do the job but it can be improved
query
=
(
session
.
query
(
ScoreSpec
.
ngram_id
,
ScoreSpec
.
score
)
.
join
(
ListMain
,
ScoreSpec
.
ngram_id
==
ListMain
.
ngram_id
)
.
join
(
Ngram
,
Ngram
.
id
==
ScoreSpec
.
ngram_id
)
.
join
(
Ngram
,
Ngram
.
id
==
ScoreSpec
.
ngram_id
)
#.outerjoin(ListGroup, Group.ngramy_id == ScoreSpec.ngram_id)
.
filter
(
ScoreSpec
.
node_id
==
specificity_id
)
#.outerjoin(ListStop, Stop.ngram_id == ScoreSpec.ngram_id)
.
filter
(
ScoreSpec
.
ngram_id
.
in_
(
mainterms_subquery
))
.
filter
(
ListMain
.
node_id
==
list_main_id
)
.
filter
(
ScoreSpec
.
ngram_id
.
in_
(
primary_groupterms_subquery
))
#.filter(ListGroup.node_id == list_group_id)
#.filter(ListStop.node_id == list_stop_id)
.
filter
(
ScoreSpec
.
nodex_id
==
score_spec_id
)
)
)
top_monograms
=
(
query
top_monograms
=
(
query
.
filter
(
Ngram
.
n
==
1
)
.
filter
(
Ngram
.
n
==
1
)
.
order_by
(
desc
(
ScoreSpec
.
score
))
.
order_by
(
desc
(
ScoreSpec
.
weight
))
.
limit
(
monograms_limit
)
.
limit
(
monograms_limit
)
.
all
()
)
)
top_multigrams
=
(
query
top_multigrams
=
(
query
.
filter
(
Ngram
.
n
>=
2
)
.
filter
(
Ngram
.
n
>=
2
)
.
order_by
(
desc
(
ScoreSpec
.
score
))
.
order_by
(
desc
(
ScoreSpec
.
weight
))
.
limit
(
multigrams_limit
)
.
limit
(
multigrams_limit
)
.
all
()
)
)
stop_ngrams
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
print
(
"MAPLIST: top_monograms ="
,
len
(
top_monograms
))
.
filter
(
NodeNgram
.
node_id
==
list_stop_id
)
print
(
"MAPLIST: top_multigrams = "
,
len
(
top_multigrams
))
.
all
()
)
# NEW MAPLIST NODE
# -----------------
grouped_ngrams
=
(
session
.
query
(
NodeNgramNgram
.
ngramy_id
)
# saving the parameters of the analysis in the Node JSON
.
filter
(
NodeNgramNgram
.
node_id
==
list_group_id
)
new_hyperdata
=
{
'corpus'
:
corpus
.
id
,
.
all
()
'limit'
:
limit
,
'monograms_part'
:
monograms_part
}
if
overwrite_id
:
# overwrite pre-existing node
the_maplist
=
cache
.
Node
[
overwrite_id
]
the_maplist
.
hyperdata
=
new_hyperdata
the_maplist
.
save_hyperdata
()
session
.
commit
()
the_id
=
overwrite_id
else
:
# create a new maplist node
the_maplist
=
corpus
.
add_child
(
name
=
"Maplist (in
%
i)"
%
corpus
.
id
,
typename
=
"MAPLIST"
,
hyperdata
=
new_hyperdata
)
)
session
.
add
(
the_maplist
)
list_map_id
=
session
.
query
(
Node
.
id
)
.
filter
(
Node
.
parent_id
==
corpus_id
,
Node
.
typename
==
"MAPLIST"
)
.
first
()
if
list_map_id
==
None
:
corpus
=
cache
.
Node
[
corpus_id
]
user_id
=
corpus
.
user_id
list_map
=
Node
(
name
=
"MAPLIST"
,
parent_id
=
corpus_id
,
user_id
=
user_id
,
typename
=
"MAPLIST"
)
session
.
add
(
list_map
)
session
.
commit
()
session
.
commit
()
list_map_id
=
list_map
.
id
the_id
=
the_maplist
.
id
# create UnweightedList object and save (=> new NodeNgram rows)
session
.
query
(
NodeNgram
)
.
filter
(
NodeNgram
.
node_id
==
list_map_id
)
.
delete
()
datalist
=
UnweightedList
(
session
.
commit
()
[
res
.
ngram_id
for
res
in
top_monograms
+
top_multigrams
]
)
data
=
zip
(
[
list_map_id
for
i
in
range
(
1
,
limit
)]
# save
,
[
n
[
0
]
for
n
in
list
(
top_multigrams
)
+
list
(
top_monograms
)
datalist
.
save
(
the_id
)
if
(
n
[
0
],)
not
in
list
(
stop_ngrams
)
]
# dbg.show('MapList computed')
,
[
1
for
i
in
range
(
1
,
limit
)]
)
#print([d for d in data])
bulk_insert
(
NodeNgram
,
[
'node_id'
,
'ngram_id'
,
'weight'
],
[
d
for
d
in
data
])
dbg
.
show
(
'MapList computed'
)
return
the_id
gargantext/util/toolchain/list_stop.py
View file @
58aa990d
from
gargantext.util.db
import
*
"""
from
gargantext.util.db_cache
import
*
Creates a filtering list for corpus ngrams.
from
gargantext.constants
import
*
(implementation: regexp + "master" stoplist)
"""
from
gargantext.util.db
import
session
,
aliased
,
func
from
gargantext.util.lists
import
WeightedMatrix
from
gargantext.models
import
User
,
Node
,
Ngram
,
NodeNgram
from
gargantext.models
import
User
,
Node
,
Ngram
,
NodeNgram
from
gargantext.util.db
import
session
,
func
from
gargantext.constants
import
LISTTYPES
from
re
import
compile
from
sqlalchemy
import
desc
import
re
def
is_stop_word
(
ngram
,
stop_words
=
None
):
from
sqlalchemy
import
desc
,
asc
#from ngram.tools import insert_ngrams
def
isStopWord
(
ngram
,
stop_words
=
None
):
'''
'''
ngram :: (Int, String) => (ngram_id, ngram_terms)
ngram :: (Int, String) => (ngram_id, ngram_terms)
stop_words :: Set of String
stop_words :: Set of String
(to avoid SQL query each time is
StopW
ord is invoked, get in as parameter)
(to avoid SQL query each time is
_stop_w
ord is invoked, get in as parameter)
'''
'''
word
=
ngram
[
1
]
word
=
ngram
[
1
]
...
@@ -41,7 +39,7 @@ def isStopWord(ngram, stop_words=None):
...
@@ -41,7 +39,7 @@ def isStopWord(ngram, stop_words=None):
,
"(.*)(travers)(.*)"
,
"(.*)(travers)(.*)"
,
"(.*)(:|
\
|)(.*)"
,
"(.*)(:|
\
|)(.*)"
]
:
]
:
compiled_regexes
.
append
(
re
.
compile
(
regex
))
compiled_regexes
.
append
(
compile
(
regex
))
for
format_regex
in
compiled_regexes
:
for
format_regex
in
compiled_regexes
:
if
format_regex
.
match
(
word
):
if
format_regex
.
match
(
word
):
...
@@ -61,32 +59,27 @@ def create_gargantua_resources():
...
@@ -61,32 +59,27 @@ def create_gargantua_resources():
session
.
add
(
stopList
)
session
.
add
(
stopList
)
session
.
commit
()
session
.
commit
()
def
compute_stop
(
corpus
,
stopList_id
=
None
,
debug
=
Fals
e
):
def
do_stoplist
(
corpus
,
overwrite_id
=
Non
e
):
'''
'''
Create list of stop words.
Create list of stop words.
TODO do a function to get all stop words with social scores
TODO do a function to get all stop words with social scores
Parameters:
- overwrite_id: optional preexisting STOPLIST node to overwrite
'''
'''
# Get the StopList if it exist or create a new one
# Get preexisting StopList if provided in overwrite_id param
if
overwrite_id
:
stoplist_id
=
overwrite_id
# At this step of development, a new StopList should be created
# At this step of development, a new StopList should be created
if
stopList_id
==
None
:
else
:
stopList_id
=
session
.
query
(
Node
.
id
)
.
filter
(
stoplist
=
corpus
.
add_child
(
Node
.
parent_id
==
corpus
.
id
,
name
=
"Stoplist (in:
%
s)"
%
corpus
.
id
,
Node
.
typename
==
"STOPLIST"
typename
=
"STOPLIST"
)
.
first
()
)
if
stopList_id
==
None
:
session
.
add
(
stoplist
)
stopList
=
Node
(
name
=
"STOPLIST"
,
parent_id
=
corpus
.
id
,
user_id
=
corpus
.
user_id
,
typename
=
"STOPLIST"
)
session
.
add
(
stopList
)
session
.
commit
()
stopList_id
=
stopList
.
id
# For tests only
if
debug
==
True
:
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
stopList_id
)
.
delete
()
session
.
commit
()
session
.
commit
()
stoplist_id
=
stoplist
.
id
# Get common resources, all common StopWords on the platform
# Get common resources, all common StopWords on the platform
## First get the id of the StopList of Gargantua super user
## First get the id of the StopList of Gargantua super user
...
@@ -107,23 +100,23 @@ def compute_stop(corpus, stopList_id=None, debug=False):
...
@@ -107,23 +100,23 @@ def compute_stop(corpus, stopList_id=None, debug=False):
## Get the ngrams
## Get the ngrams
## ngrams :: [(Int, String, Int)]
## ngrams :: [(Int, String, Int)]
frequency
=
func
.
count
(
NodeNgram
.
weight
)
ngrams
=
(
session
.
query
(
Ngram
.
id
,
Ngram
.
terms
)
ngrams
=
(
session
.
query
(
Ngram
.
id
,
Ngram
.
terms
,
frequency
)
.
join
(
NodeNgram
,
NodeNgram
.
ngram_id
==
Ngram
.
id
)
.
join
(
NodeNgram
,
NodeNgram
.
ngram_id
==
Ngram
.
id
)
.
join
(
Node
,
Node
.
id
==
NodeNgram
.
node_id
)
.
join
(
Node
,
Node
.
id
==
NodeNgram
.
node_id
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
,
.
filter
(
Node
.
parent_id
==
corpus
.
id
,
Node
.
typename
==
"DOCUMENT"
)
Node
.
typename
==
"DOCUMENT"
)
.
group_by
(
Ngram
.
id
)
.
group_by
(
Ngram
.
id
)
.
order_by
(
desc
(
frequency
)
)
#.limit(limit)
#.limit(limit)
.
all
()
.
all
()
)
)
ngrams_to_stop
=
filter
(
lambda
x
:
isStopWord
(
x
,
stop_words
=
stop_words
),
ngrams
)
ngrams_to_stop
=
filter
(
lambda
x
:
is_stop_word
(
x
,
stop_words
=
stop_words
),
ngrams
)
# print([n for n in ngrams_to_stop])
# print([n for n in ngrams_to_stop])
stop
=
LISTTYPES
[
"STOPLIST"
]({
n
[
0
]
:
-
1
for
n
in
ngrams_to_stop
})
stop
=
LISTTYPES
[
"STOPLIST"
]({
n
[
0
]
:
-
1
for
n
in
ngrams_to_stop
})
# stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])
# stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])
stop
.
save
(
stop
L
ist_id
)
stop
.
save
(
stop
l
ist_id
)
return
stop
L
ist_id
return
stop
l
ist_id
gargantext/util/toolchain/ngram_coocs_tempo.py
View file @
58aa990d
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNgramNgram
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNgramNgram
from
gargantext.util.lists
import
WeightedMatrix
from
gargantext.util.lists
import
WeightedMatrix
from
gargantext.util.db
import
session
,
aliased
,
func
from
gargantext.util.db
import
session
,
aliased
,
func
from
gargantext.constants
import
DEFAULT_COOC_THRESHOLD
from
gargantext.util.db_cache
import
cache
from
gargantext.constants
import
DEFAULT_COOC_THRESHOLD
def
compute_coocs
(
corpus
,
def
compute_coocs
(
corpus
,
threshold
=
DEFAULT_COOC_THRESHOLD
,
overwrite_id
=
None
,
weighted
=
False
,
threshold
=
DEFAULT_COOC_THRESHOLD
,
our_id
=
None
,
mainlist_id
=
None
,
stop
_id
=
None
,
stop
list_id
=
None
,
symmetry_filter
=
True
):
symmetry_filter
=
True
):
"""
"""
Count how often some extracted terms appear
Count how often some extracted terms appear
together in a small context (document)
together in a small context (document)
throughout a larger context (corpus).
throughout a larger context (corpus).
node_id | ngram_id | weight ngram1_id | ngram2_id | ucooc | wcooc |
[NodeNgram] [NodeNgramNgram]
--------+----------+-------- ----------+-----------+-------+-------+
MYDOC | 487 | 1 => 487 | 294 | 1 | 4 |
node_id | ngram_id | weight ngram1_id | ngram2_id | score |
MYDOC | 294 | 3
--------+----------+-------- ----------+-----------+-------+
MYDOCA | 487 | 1 => 487 | 294 | 2 |
MYDOCA | 294 | 3
MYDOCB | 487 | 1
MYDOCB | 294 | 4
Fill that info in DB:
Fill that info in DB:
- a *new* COOCCURRENCES node
- a *new* COOCCURRENCES node
...
@@ -25,14 +30,16 @@ def compute_coocs(corpus,
...
@@ -25,14 +30,16 @@ def compute_coocs(corpus,
worst-case complexity ~ O(N²/2) with N = number of ngrams
worst-case complexity ~ O(N²/2) with N = number of ngrams
If a mainlist is provided, we filter doc ngrams to those also in the list.
Parameters:
Parameters:
- th
reshold: on output ucooc count (previously called hapax)
- th
e corpus node
-
weighted: if False normal cooc to be saved as result
-
overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
if True weighted cooc (experimental
)
(all hyperdata and previous NodeNgramNgram rows will be replaced
)
-
stop_id: stoplist for filtering input ngrams
-
threshold: on output cooc count (previously called hapax)
-
TODO cvalue_id: allow a metric as input filter
-
mainlist_id: mainlist to constrain the input ngrams
-
TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
-
stoplist_id: stoplist for filtering input ngrams
- TODO start, end : filter on document date
(normally unnecessary if a mainlist is provided)
(deprecated parameters)
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
...
@@ -54,14 +61,17 @@ def compute_coocs(corpus,
...
@@ -54,14 +61,17 @@ def compute_coocs(corpus,
coocs for each doc :
coocs for each doc :
- each given pair like (termA, termB) will likely appear several times
- each given pair like (termA, termB) will likely appear several times
=> we do GROUP BY (x1.ngram_id, x2.ngram_id)
=> we do GROUP BY (x1.ngram_id, x2.ngram_id)
-
normally we can count unique appearances of the pair (u
cooc)
-
we count unique appearances of the pair (
cooc)
- we can count sum of sum of weights in the pair (wcooc or cofreq)
TODO
====
use WeightedMatrix
"""
"""
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO start, end : filter on document date
# - TODO weighted: if False normal cooc to be saved as result
# if True weighted cooc (experimental)
# /!\ big combinatorial complexity /!\
# /!\ big combinatorial complexity /!\
# pour 8439 lignes dans l'index nodes_ngrams dont 1442 avec occ > 1
# pour 8439 lignes dans l'index nodes_ngrams dont 1442 avec occ > 1
# 1.859.408 lignes pour la requête cooc simple
# 1.859.408 lignes pour la requête cooc simple
...
@@ -94,10 +104,22 @@ def compute_coocs(corpus,
...
@@ -94,10 +104,22 @@ def compute_coocs(corpus,
# 2) INPUT FILTERS (reduce N before O(N²))
# 2) INPUT FILTERS (reduce N before O(N²))
# £TODO add possibility to restrict to the mainlist
# £TODO add possibility to restrict to the mainlist
if
stop_id
:
if
mainlist_id
:
main_subquery
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
==
mainlist_id
)
.
subquery
()
)
coocs_query
=
(
coocs_query
.
filter
(
x1
.
ngram_id
.
in_
(
main_subquery
)
)
.
filter
(
x2
.
ngram_id
.
in_
(
main_subquery
)
)
)
if
stoplist_id
:
stop_subquery
=
(
stop_subquery
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
==
stop_id
)
.
filter
(
NodeNgram
.
node_id
==
stop
list
_id
)
.
subquery
()
.
subquery
()
)
)
...
@@ -128,30 +150,36 @@ def compute_coocs(corpus,
...
@@ -128,30 +150,36 @@ def compute_coocs(corpus,
# 3) OUTPUT FILTERS
# 3) OUTPUT FILTERS
# ------------------
# ------------------
# threshold
# threshold
#
coocs_query
=
coocs_query
.
having
(
ucooc
>=
threshold
)
coocs_query
=
coocs_query
.
having
(
ucooc
>
threshold
)
# 4) EXECUTE QUERY
# 4) EXECUTE QUERY
# ----------------
# ----------------
# => storage in our matrix structure
# => storage in our matrix structure
matrix
=
WeightedMatrix
(
coocs_query
.
all
())
matrix
=
WeightedMatrix
(
coocs_query
.
all
())
# fyi
# shape_0 = len({pair[0] for pair in matrix.items})
# shape_1 = len({pair[1] for pair in matrix.items})
# print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
# 5) SAVE
# 5) SAVE
# --------
# --------
if
our_id
:
# saving the parameters of the analysis in the Node JSON
# use pre-existing id
new_hyperdata
=
{
'corpus'
:
corpus
.
id
,
the_id
=
our_id
'threshold'
:
threshold
}
if
overwrite_id
:
# overwrite pre-existing id
the_cooc
=
cache
.
Node
[
overwrite_id
]
the_cooc
.
hyperdata
=
new_hyperdata
the_cooc
.
save_hyperdata
()
session
.
commit
()
the_id
=
overwrite_id
else
:
else
:
# create the new cooc node
# create the new cooc node
the_cooc
=
Node
(
the_cooc
=
corpus
.
add_child
(
typename
=
"COOCCURRENCES"
,
typename
=
"COOCCURRENCES"
,
name
=
"Coocs (in:
%
s)"
%
corpus
.
name
[
0
:
10
],
name
=
"Coocs (in:
%
s)"
%
corpus
.
name
[
0
:
10
],
parent_id
=
corpus
.
id
,
hyperdata
=
new_hyperdata
,
user_id
=
corpus
.
user_id
,
# saving the parameters of the analysis in the Node JSON
hyperdata
=
{
'corpus'
:
corpus
.
id
,
'threshold'
:
threshold
}
)
)
session
.
add
(
the_cooc
)
session
.
add
(
the_cooc
)
session
.
commit
()
session
.
commit
()
...
...
gargantext/util/toolchain/ngram_groups.py
View file @
58aa990d
from
gargantext.models
import
Node
,
NodeNgramNgram
"""
from
gargantext.util.db
import
session
For initial ngram groups via stemming
from
gargantext.util.lists
import
Translations
Example:
- groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
- groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
"""
from
gargantext.models
import
Node
,
NodeNgramNgram
from
gargantext.util.db
import
session
from
gargantext.util.lists
import
Translations
# to convert fr => french :/
# to convert fr => french :/
from
gargantext.util.languages
import
languages
from
gargantext.util.languages
import
languages
from
nltk.stem.snowball
import
SnowballStemmer
from
re
import
split
as
resplit
from
re
import
split
as
resplit
from
collections
import
defaultdict
,
Counter
from
collections
import
defaultdict
,
Count
er
from
nltk.stem.snowball
import
SnowballStemm
er
def
prepare_stemmers
(
corpus
):
def
prepare_stemmers
(
corpus
):
"""
"""
...
@@ -22,7 +29,7 @@ def prepare_stemmers(corpus):
...
@@ -22,7 +29,7 @@ def prepare_stemmers(corpus):
stemmers_by_lg
[
lgiso2
]
=
SnowballStemmer
(
lgname
)
stemmers_by_lg
[
lgiso2
]
=
SnowballStemmer
(
lgname
)
return
stemmers_by_lg
return
stemmers_by_lg
def
compute_groups
(
corpus
,
stoplist_id
=
None
):
def
compute_groups
(
corpus
,
stoplist_id
=
None
,
overwrite_id
=
None
):
"""
"""
1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
...
@@ -98,17 +105,21 @@ def compute_groups(corpus, stoplist_id = None):
...
@@ -98,17 +105,21 @@ def compute_groups(corpus, stoplist_id = None):
del
my_groups
del
my_groups
# 2) Create the list node
# 2) the list node
the_group
=
Node
()
if
overwrite_id
:
the_group
.
typename
=
"GROUPLIST"
# overwrite pre-existing id
the_group
.
name
=
"Group (src:
%
s)"
%
corpus
.
name
[
0
:
10
]
the_id
=
overwrite_id
the_group
.
parent_id
=
corpus
.
id
# could use corpus.parent_id if free list
# or create the new id
the_group
.
user_id
=
corpus
.
user_id
else
:
the_group
=
corpus
.
add_child
(
# and save the node
typename
=
"GROUPLIST"
,
session
.
add
(
the_group
)
name
=
"Group (src:
%
s)"
%
corpus
.
name
[
0
:
10
]
session
.
commit
()
)
the_id
=
the_group
.
id
# and save the node
session
.
add
(
the_group
)
session
.
commit
()
the_id
=
the_group
.
id
# 3) Save each grouping couple to DB thanks to Translations.save() table
# 3) Save each grouping couple to DB thanks to Translations.save() table
ndngng_list
=
Translations
(
ndngng_list
=
Translations
(
...
...
gargantext/util/toolchain/ngram_scores.py
View file @
58aa990d
"""
Computes ngram scores with 3 ranking functions:
- the simple sum of occurrences inside the corpus
- the tfidf inside the corpus
- the global tfidf for all corpora having same source
FIXME: "having the same source" means we need to select inside hyperdata
with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
"""
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
from
gargantext.util.db
import
session
,
bulk_insert
from
gargantext.util.db
import
session
,
bulk_insert
,
func
# = sqlalchemy.func like sum() or count()
from
sqlalchemy
import
text
from
sqlalchemy
import
text
# for query from raw SQL statement
from
math
import
log
# £TODO
# £TODO
# from gargantext.util.lists import WeightedContextIndex
# from gargantext.util.lists import WeightedContextIndex
from
gargantext.util.db
import
func
# = sqlalchemy.func like sum() or count()
from
math
import
log
def
compute_occurrences_local
(
corpus
):
def
compute_occurrences_local
(
corpus
,
overwrite_id
=
None
):
"""
"""
Calculates sum of occs per ngram within corpus
Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view)
? optimize ? OCCS here could be calculated simultaneously within TFIDF-CORPUS loop
Parameters:
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
"""
# 1) all the doc_ids of our corpus (scope of counts for filter)
# 1) all the doc_ids of our corpus (scope of counts for filter)
...
@@ -37,32 +52,41 @@ def compute_occurrences_local(corpus):
...
@@ -37,32 +52,41 @@ def compute_occurrences_local(corpus):
# ^^^^ ^^^
# ^^^^ ^^^
# ngram_id sum_wei
# ngram_id sum_wei
# create the new OCCURRENCES node
occnode
=
Node
()
if
overwrite_id
:
occnode
.
typename
=
"OCCURRENCES"
# overwrite pre-existing id
occnode
.
name
=
"occ_sums (in:
%
s)"
%
corpus
.
id
the_id
=
overwrite_id
occnode
.
parent_id
=
corpus
.
id
# occnode = cache.Node[overwrite_id]
occnode
.
user_id
=
corpus
.
user_id
else
:
session
.
add
(
occnode
)
# create the new OCCURRENCES node
session
.
commit
()
occnode
=
corpus
.
add_child
(
typename
=
"OCCURRENCES"
,
name
=
"occ_sums (in:
%
s)"
%
corpus
.
id
)
session
.
add
(
occnode
)
session
.
commit
()
the_id
=
occnode
.
id
# reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
# reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert
(
bulk_insert
(
NodeNodeNgram
,
NodeNodeNgram
,
(
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
),
(
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
),
((
occnode
.
id
,
corpus
.
id
,
res
[
0
],
res
[
1
])
for
res
in
occ_sums
)
((
the_
id
,
corpus
.
id
,
res
[
0
],
res
[
1
])
for
res
in
occ_sums
)
)
)
return
occnode
.
id
return
the_
id
def
compute_tfidf
(
corpus
,
scope
=
"local"
):
def
compute_tfidf
(
corpus
,
scope
=
"local"
,
overwrite_id
=
None
):
"""
"""
Calculates tfidf within the current corpus
Calculates tfidf within the current corpus
Parameter:
Parameters:
- the corpus itself
- scope: {"local" or "global"}
- scope: {"local" or "global"}
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
"""
# local <=> within this corpus
# local <=> within this corpus
...
@@ -121,23 +145,27 @@ def compute_tfidf(corpus, scope="local"):
...
@@ -121,23 +145,27 @@ def compute_tfidf(corpus, scope="local"):
tfidfs
[
ngram_id
]
=
tf
*
(
log_tot_docs
-
log
(
nd
))
tfidfs
[
ngram_id
]
=
tf
*
(
log_tot_docs
-
log
(
nd
))
# -------------------------------------------------
# -------------------------------------------------
# create the new TFIDF-CORPUS node
if
overwrite_id
:
tfidf_nd
=
Node
(
parent_id
=
corpus
.
id
,
user_id
=
corpus
.
user_id
)
the_id
=
overwrite_id
if
scope
==
"local"
:
else
:
tfidf_nd
.
typename
=
"TFIDF-CORPUS"
# create the new TFIDF-XXXX node
tfidf_nd
.
name
=
"tfidf-c (in:
%
s)"
%
corpus
.
id
tfidf_nd
=
corpus
.
add_child
()
elif
scope
==
"global"
:
if
scope
==
"local"
:
tfidf_nd
.
typename
=
"TFIDF-GLOBAL"
tfidf_nd
.
typename
=
"TFIDF-CORPUS"
tfidf_nd
.
name
=
"tfidf-g (in type:
%
s)"
%
this_source_type
tfidf_nd
.
name
=
"tfidf-c (in:
%
s)"
%
corpus
.
id
session
.
add
(
tfidf_nd
)
elif
scope
==
"global"
:
session
.
commit
()
tfidf_nd
.
typename
=
"TFIDF-GLOBAL"
tfidf_nd
.
name
=
"tfidf-g (in type:
%
s)"
%
this_source_type
session
.
add
(
tfidf_nd
)
session
.
commit
()
the_id
=
tfidf_nd
.
id
# reflect that in NodeNodeNgrams
# reflect that in NodeNodeNgrams
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert
(
bulk_insert
(
NodeNodeNgram
,
NodeNodeNgram
,
(
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
),
(
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
),
((
t
fidf_nd
.
id
,
corpus
.
id
,
ng
,
tfidfs
[
ng
])
for
ng
in
tfidfs
)
((
t
he_id
,
corpus
.
id
,
ng
,
tfidfs
[
ng
])
for
ng
in
tfidfs
)
)
)
return
t
fidf_nd
.
id
return
t
he_
id
gargantext/util/toolchain/score_specificity.py
View file @
58aa990d
from
gargantext.util.db
import
session
,
aliased
,
func
"""
from
gargantext.util.db_cache
import
*
Computes a specificity metric from the ngram cooccurrence matrix.
from
gargantext.constants
import
*
+ SAVE => WeightedList => NodeNgram
"""
# from gargantext.util.analysis.cooccurrences import do_cooc
from
gargantext.models
import
Node
,
Ngram
,
NodeNgram
,
NodeNgramNgram
from
gargantext.util.db
import
session
,
aliased
,
func
,
bulk_insert
from
gargantext.models
import
Node
,
Ngram
,
NodeNgramNgram
,
NodeNodeNgram
from
gargantext.util.lists
import
WeightedList
from
collections
import
defaultdict
import
pandas
as
pd
from
pandas
import
DataFrame
from
collections
import
defaultdict
def
compute_specificity
(
corpus
,
cooc_id
=
None
,
overwrite_id
=
None
):
def
compute_specificity
(
corpus
,
cooc_id
,
limit
=
100
):
'''
'''
Compute the specificity, simple calculus.
Compute the specificity, simple calculus.
Parameters:
- cooc_id: mandatory id of a cooccurrences node to use as base
- overwrite_id: optional preexisting specificity node to overwrite
'''
'''
cooccurrences
=
(
session
.
query
(
NodeNgramNgram
)
cooccurrences
=
(
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
cooc_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
cooc_id
)
# no filtering: new choice filter on tfidf before creation
# .order_by(NodeNgramNgram.weight)
# .limit(limit)
)
)
# no filtering: new choice cooc already filtered on tfidf before creation
matrix
=
defaultdict
(
lambda
:
defaultdict
(
float
))
matrix
=
defaultdict
(
lambda
:
defaultdict
(
float
))
...
@@ -30,7 +31,9 @@ def compute_specificity(corpus, cooc_id, limit=100):
...
@@ -30,7 +31,9 @@ def compute_specificity(corpus, cooc_id, limit=100):
nb_ngrams
=
len
(
matrix
)
nb_ngrams
=
len
(
matrix
)
d
=
pd
.
DataFrame
(
matrix
)
.
fillna
(
0
)
print
(
"SPECIFICITY: computing on
%
i ngrams"
%
nb_ngrams
)
d
=
DataFrame
(
matrix
)
.
fillna
(
0
)
# proba (x/y) ( <= on divise chaque colonne par son total)
# proba (x/y) ( <= on divise chaque colonne par son total)
d
=
d
/
d
.
sum
(
axis
=
0
)
d
=
d
/
d
.
sum
(
axis
=
0
)
...
@@ -74,28 +77,27 @@ def compute_specificity(corpus, cooc_id, limit=100):
...
@@ -74,28 +77,27 @@ def compute_specificity(corpus, cooc_id, limit=100):
# ----------------
# ----------------
# specificity node
# specificity node
node
=
session
.
query
(
Node
)
.
filter
(
if
overwrite_id
:
Node
.
parent_id
==
corpus
.
id
,
# overwrite pre-existing id
Node
.
typename
==
"SPECIFICITY"
the_id
=
overwrite_id
)
.
first
()
session
.
query
(
NodeNodeNgram
)
.
filter
(
NodeNodeNgram
.
node1_id
==
the_id
)
.
delete
()
session
.
commit
()
if
node
==
None
:
else
:
user_id
=
corpus
.
user_id
specnode
=
corpus
.
add_child
(
node
=
Node
(
name
=
"Specif (in:
%
i)"
%
corpus
.
id
,
typename
=
"SPECIFICITY"
,
parent_id
=
corpus
.
id
,
name
=
"Specif (in:
%
s)"
%
corpus
.
id
user_id
=
user_id
,
)
typename
=
"SPECIFICITY"
)
session
.
add
(
specnode
)
session
.
add
(
node
)
session
.
commit
()
session
.
commit
()
the_id
=
specnode
.
id
data
=
zip
(
[
node
.
id
]
*
nb_ngrams
# print(v)
,
[
corpus
.
id
]
*
nb_ngrams
,
v
.
index
.
tolist
()
,
v
.
values
.
tolist
()
)
session
.
query
(
NodeNodeNgram
)
.
filter
(
NodeNodeNgram
.
node1_id
==
node
.
id
)
.
delete
()
session
.
commit
()
bulk_insert
(
NodeNodeNgram
,
[
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
],
[
d
for
d
in
data
])
data
=
WeightedList
(
zip
(
v
.
index
.
tolist
()
,
v
.
values
.
tolist
()
)
)
data
.
save
(
the_id
)
return
(
node
.
id
)
return
(
the_
id
)
gargantext/views/pages/projects.py
View file @
58aa990d
...
@@ -94,6 +94,7 @@ def project(request, project_id):
...
@@ -94,6 +94,7 @@ def project(request, project_id):
)
)
session
.
add
(
corpus
)
session
.
add
(
corpus
)
session
.
commit
()
session
.
commit
()
# parse_extract: fileparsing -> ngram extraction -> lists
scheduled
(
parse_extract
)(
corpus
.
id
)
scheduled
(
parse_extract
)(
corpus
.
id
)
# corpora within this project
# corpora within this project
...
@@ -101,16 +102,26 @@ def project(request, project_id):
...
@@ -101,16 +102,26 @@ def project(request, project_id):
sourcename2corpora
=
defaultdict
(
list
)
sourcename2corpora
=
defaultdict
(
list
)
for
corpus
in
corpora
:
for
corpus
in
corpora
:
# we only consider the first resource of the corpus to determine its type
# we only consider the first resource of the corpus to determine its type
resource
=
corpus
.
resources
()[
0
]
resources
=
corpus
.
resources
()
resource_type_name
=
RESOURCETYPES
[
resource
[
'type'
]][
'name'
]
if
len
(
resources
):
resource
=
resources
[
0
]
resource_type_name
=
RESOURCETYPES
[
resource
[
'type'
]][
'name'
]
else
:
print
(
"(WARNING) PROJECT view: no listed resource"
)
# add some data for the viewer
# add some data for the viewer
corpus
.
count
=
corpus
.
children
(
'DOCUMENT'
)
.
count
()
corpus
.
count
=
corpus
.
children
(
'DOCUMENT'
)
.
count
()
status
=
corpus
.
status
()
status
=
corpus
.
status
()
if
status
is
not
None
and
not
status
[
'complete'
]:
if
status
is
not
None
and
not
status
[
'complete'
]:
corpus
.
status_message
=
'(in progress:
%
s,
%
d complete)'
%
(
if
not
status
[
'error'
]:
status
[
'action'
]
.
replace
(
'_'
,
' '
),
corpus
.
status_message
=
'(in progress:
%
s,
%
d complete)'
%
(
status
[
'progress'
],
status
[
'action'
]
.
replace
(
'_'
,
' '
),
)
status
[
'progress'
],
)
else
:
corpus
.
status_message
=
'(aborted: "
%
s" after
%
i docs)'
%
(
status
[
'error'
][
-
1
],
status
[
'progress'
]
)
else
:
else
:
corpus
.
status_message
=
''
corpus
.
status_message
=
''
# add
# add
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment