humanities / gargantext — Commits

Commit f7d58faf, authored Mar 15, 2016 by delanoe

    [MERGE] merge of Romain and Mathieu branches.

Parents: 309e6c69, eec89097

Changes: 15 changed files, with 1340 additions and 34 deletions (+1340 −34)
Files changed:

    doc/ngram_parsing_flow.png                        +0    −0
    gargantext/constants.py                           +33   −15
    gargantext/models/ngrams.py                       +10   −2
    gargantext/util/analysis/cooccurrences.py         +207  −0
    gargantext/util/lists.py                          +49   −9
    gargantext/util/toolchain/__init__.py             +69   −2
    gargantext/util/toolchain/list_main.py            +100  −0
    gargantext/util/toolchain/list_map.py             +123  −0
    gargantext/util/toolchain/list_stop.py            +122  −0
    gargantext/util/toolchain/metric_specificity.py   +103  −0
    gargantext/util/toolchain/metric_tfidf.py         +171  −0
    gargantext/util/toolchain/ngram_coocs.py          +193  −0
    gargantext/util/toolchain/ngram_groups.py         +133  −0
    gargantext/util/toolchain/parsing.py              +8    −0
    gargantext/views/pages/projects.py                +19   −6
doc/ngram_parsing_flow.png (new file, mode 100644, 52.5 KB — binary image not shown)
gargantext/constants.py

@@ -9,29 +9,32 @@ LISTTYPES = {
     'STOPLIST'      : UnweightedList,
     'MAINLIST'      : UnweightedList,
     'MAPLIST'       : UnweightedList,
-    'OCCURRENCES'   : WeightedList,
     'SPECIFICITY'   : WeightedList,
+    'OCCURRENCES'   : WeightedContextIndex,
     'COOCCURRENCES' : WeightedMatrix,
+    'TFIDF-CORPUS'  : WeightedContextIndex,
+    'TFIDF-GLOBAL'  : WeightedContextIndex,
 }

 NODETYPES = [
     None,
     # documents hierarchy
-    'USER',
-    'PROJECT',
-    'CORPUS',
-    'DOCUMENT',
+    'USER',             # 1
+    'PROJECT',          # 2
+    'CORPUS',           # 3
+    'DOCUMENT',         # 4
     # lists
-    'STOPLIST',
-    'GROUPLIST',
-    'MAINLIST',
-    'MAPLIST',
-    'COOCCURRENCES',
+    'STOPLIST',         # 5
+    'GROUPLIST',        # 6
+    'MAINLIST',         # 7
+    'MAPLIST',          # 8
+    'COOCCURRENCES',    # 9
     # scores
-    'OCCURRENCES',
-    'SPECIFICITY',
-    'CVALUE',
-    'TFIDF-CORPUS',
-    'TFIDF-GLOBAL',
+    'OCCURRENCES',      # 10
+    'SPECIFICITY',      # 11
+    'CVALUE',           # 12
+    'TFIDF-CORPUS',     # 13
+    'TFIDF-GLOBAL',     # 14
 ]

 import datetime

@@ -108,6 +111,21 @@ RESOURCETYPES = [
     # },
 ]

+# linguistic extraction parameters ---------------------------------------------
+DEFAULT_TFIDF_CUTOFF_RATIO      = .45   # MAINLIST maximum terms in %
+
+DEFAULT_TFIDF_HARD_LIMIT        = 750   # MAINLIST maximum terms abs
+                                        # (makes COOCS larger ~ O(N²) /!\)
+
+DEFAULT_COOC_THRESHOLD          = 5     # inclusive minimum for COOCS coefs
+                                        # (makes COOCS more sparse)
+
+DEFAULT_MAPLIST_MAX             = 300   # MAPLIST maximum terms
+
+DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5    # part of monograms in MAPLIST
+                                        # (NB: used to be 0.005 !!)
+# ------------------------------------------------------------------------------
+
 # other parameters
 # default number of docs POSTed to scrappers.views.py
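
The two MAINLIST constants interact in do_mainlist (gargantext/util/toolchain/list_main.py, added below): the ratio applies to the number of distinct ngrams first, then the absolute cap wins. A standalone sketch with invented corpus sizes:

    # Standalone sketch: effective MAINLIST size for a few corpus sizes.
    DEFAULT_TFIDF_CUTOFF_RATIO = .45
    DEFAULT_TFIDF_HARD_LIMIT   = 750

    for nb_ngrams in (1000, 2000, 5000):
        kept = min(DEFAULT_TFIDF_HARD_LIMIT,
                   round(nb_ngrams * DEFAULT_TFIDF_CUTOFF_RATIO))
        print(nb_ngrams, "->", kept)    # 1000 -> 450, 2000 -> 750, 5000 -> 750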
gargantext/models/ngrams.py

@@ -19,7 +19,7 @@ class NodeNgram(Base):
     weight = Column(Float)

 class NodeNodeNgram(Base):
-    """ for instance for tfidf:
+    """ for instance for TFIDF
         (doc ::Node, corpus ::Node, ...

@@ -37,8 +37,16 @@ class NodeNodeNgram(Base):
     # (cf. www.postgresql.org/docs/9.4/static/datatype-numeric.html#DATATYPE-FLOAT)

 class NodeNgramNgram(Base):
+    """ for instance for COOCCURRENCES and GROUPLIST
+        (cooc_node/group_node ::Node,
+         term_A ::Ngram,
+         term_B ::Ngram,
+         weight ::Float (real)
+        )
+    """
     __tablename__ = 'nodes_ngrams_ngrams'

     node_id   = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
     ngram1_id = Column(Integer, ForeignKey(Ngram.id, ondelete='CASCADE'), primary_key=True)
     ngram2_id = Column(Integer, ForeignKey(Ngram.id, ondelete='CASCADE'), primary_key=True)
-    weight    = Column(Float)
+    weight    = Column(Float(precision=24))    # see comment for NodeNodeNgram.score
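
Given this schema, reading the stored pairs back is a single keyed query. A minimal sketch, assuming an open SQLAlchemy session and a known cooccurrences node id (both hypothetical, not part of this commit):

    pairs = (session.query(NodeNgramNgram.ngram1_id,
                           NodeNgramNgram.ngram2_id,
                           NodeNgramNgram.weight)
                    .filter(NodeNgramNgram.node_id == cooc_node_id)  # hypothetical id
                    .all())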
gargantext/util/analysis/cooccurrences.py (new file, mode 100644)

from gargantext.util.db       import *
from gargantext.util.db_cache import *
from gargantext.constants     import *

from gargantext.models.nodes  import Node
from gargantext.models.ngrams import Ngram, NodeNgram, NodeNgramNgram, \
                                     NodeHyperdataNgram, NodeHyperdata, Hyperdata

from sqlalchemy     import desc, asc, or_, and_, Date, cast, select
from sqlalchemy     import literal_column
from sqlalchemy.orm import aliased
from sqlalchemy.sql import func

import datetime
import inspect

def do_cooc(corpus=None,
            field1='ngrams', field2='ngrams',
            main_id=None, stop_id=None, group_id=None,
            cvalue_id=None,
            n_min=1, n_max=None,
            start=None, end=None,
            limit=1000,
            isMonopartite=True,
            hapax=3,
            session=None):
    '''
    Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id
    For the moment, lists as parameters are not supported because lists need to
    be merged beforehand.
    corpus :: Corpus

    cvalue_id :: Int
    main_id   :: Int
    stop_id   :: Int
    group_id  :: Int

    For the moment, start and end are simple; only the year is implemented yet
    start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end   :: TimeStamp
    limit :: Int
    '''
    # TODO : add hyperdata here

    # Security test
    field1, field2 = str(field1), str(field2)

    # Get node
    node_cooc = session.query(Node).filter(
                    Node.parent_id == corpus.id,
                    Node.typename == "COOCCURRENCES"
                ).first()

    if node_cooc is None:
        node_cooc = Node(name="Coccurrences node",
                         parent_id=corpus.id,
                         user_id=corpus.user_id,
                         typename="COOCCURRENCES")
        session.add(node_cooc)
        session.commit()

    # BEGIN
    # Saving the parameters of the analysis in the Node JSONB hyperdata field
    args, _, _, parameters = inspect.getargvalues(inspect.currentframe())
    # hyperdata = dict()
    #
    # for parameter in parameters.keys():
    #     if parameter != 'corpus' and parameter != 'node_cooc':
    #         hyperdata[parameter] = parameters[parameter]
    #
    # node_cooc.hyperdata = hyperdata
    #
    # session.add(node_cooc)
    # session.commit()
    # END

    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    NodeNgramX = aliased(NodeNgram)

    cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')
    #cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')

    #print([n for n in test_query])

    if isMonopartite:
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
                      .join(Node, Node.id == NodeNgramX.node_id)
                      .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                      .filter(Node.parent_id == corpus.id, Node.typename == "DOCUMENT")
                      )
    else:
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
                      .join(Node, Node.id == NodeHyperdataNgram.node_id)
                      .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                      .join(Hyperdata, Hyperdata.id == NodeHyperdataNgram.hyperdata_id)
                      .filter(Node.parent_id == corpus.id, Node.typename == "DOCUMENT")
                      .filter(Hyperdata.name == field1)
                      )

    #print(cooc_query)

    # Size of the ngrams between n_min and n_max
    if n_min is not None or n_max is not None:
        if isMonopartite:
            NgramX = aliased(Ngram)
            cooc_query = cooc_query.join(NgramX, NgramX.id == NodeNgramX.ngram_id)

        NgramY = aliased(Ngram)
        cooc_query = (cooc_query.join(NgramY, NgramY.id == NodeNgramY.ngram_id))

    if n_min is not None:
        cooc_query = (cooc_query.filter(NgramY.n >= n_min))
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n >= n_min)

    if n_max is not None:
        cooc_query = (cooc_query.filter(NgramY.n <= n_max))
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n <= n_max)

    # Cooc between the dates start and end
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        # TODO : more complex date format here.
        date_start = datetime.datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        StartFormat = aliased(Hyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id)
                                .join(StartFormat, StartFormat.id == Start.hyperdata_id)
                                .filter(StartFormat.name == 'publication_date')
                                .filter(Start.value_datetime >= date_start_utc)
                      )

    if end is not None:
        # TODO : more complex date format here.
        date_end = datetime.datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)
        EndFormat = aliased(Hyperdata)
        cooc_query = (cooc_query.join(End, End.node_id == Node.id)
                                .join(EndFormat, EndFormat.id == End.hyperdata_id)
                                .filter(EndFormat.name == 'publication_date')
                                .filter(End.value_datetime <= date_end_utc)
                      )

    if isMonopartite:
        # Cooc is symmetric, take only the main cooccurrences and cut at the limit
        cooc_query = cooc_query.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)

    cooc_query = cooc_query.having(cooc_score > hapax)

    if isMonopartite:
        cooc_query = cooc_query.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
    else:
        cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id)

    cooc_query = cooc_query.order_by(desc('cooc_score'))
    # END of the query

    matrix = LISTTYPES["COOCCURRENCES"](cooc_query)
    #print(matrix)

    if isMonopartite:
        if main_id is not None:
            main_list = LISTTYPES["MAINLIST"](main_id)
        if stop_id is not None:
            stop_list = LISTTYPES["STOPLIST"](stop_id)
        if group_id is not None:
            group_list = LISTTYPES["GROUPLIST"](group_id)

        if main_id is not None and stop_id is None and group_id is None:
            cooc = matrix & main_list
        elif main_id is not None and stop_id is not None and group_id is None:
            cooc = matrix & (main_list - stop_list)
        elif main_id is not None and stop_id is not None and group_id is not None:
            print("main_id is not None and stop_id is not None and group_id is not None")
            cooc = matrix & (main_list * group_list - stop_list)
            #cooc = matrix & (main_list - stop_list)
        elif main_id is not None and stop_id is None and group_id is not None:
            cooc = matrix & (main_list * group_list)
        else:
            cooc = matrix
    else:
        cooc = matrix

    cooc.save(node_cooc.id)
    return(node_cooc.id)
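
A hedged usage sketch (the corpus object, the list node ids and the session are all assumptions, not values from this commit):

    # Build and store the cooc matrix for one corpus, intersected with a mainlist
    # and a grouplist, minus a stoplist; keep only pairs seen more than 3 times.
    cooc_node_id = do_cooc(corpus=my_corpus,
                           main_id=mainlist_node_id,
                           stop_id=stoplist_node_id,
                           group_id=grouplist_node_id,
                           start='2010-01-01', end='2015-12-31',
                           hapax=3,
                           session=session)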
gargantext/util/lists.py

@@ -2,7 +2,7 @@
 """
-__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList']
+__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']

 from gargantext.util.db import session, bulk_insert

@@ -70,8 +70,10 @@ class _BaseClass:

 class Translations(_BaseClass):
-    def __init__(self, source=None):
+    def __init__(self, source=None, just_items=False):
         self.items = defaultdict(int)
+        # TODO lazyinit for groups
+        #      (not necessary for save)
         self.groups = defaultdict(set)
         if source is None:
             return

@@ -83,15 +85,35 @@ class Translations(_BaseClass):
                 .filter(NodeNgramNgram.node_id == source)
             )
             self.items.update(query)
-            for key, value in self.items.items():
-                self.groups[value].add(key)
+            if not just_items:
+                for key, value in self.items.items():
+                    self.groups[value].add(key)
         elif isinstance(source, Translations):
             self.items.update(source.items)
-            self.groups.update(source.groups)
+            if not just_items:
+                self.groups.update(source.groups)
         elif hasattr(source, '__iter__'):
+            # not very intuitive with update here:
+            # /!\ source must be "reversed" (like self.items)
+            #     bad example
+            #     In > couples = [(1, 2), (1, 3)]
+            #     In > tlko = Translations(couples)
+            #     Out> Translations {1: 3}
+            #     In > tlko.save()
+            #     DB-- 3 -> 1
+            #     good example
+            #     In > reversed_couples = [(2, 1), (3, 1)]
+            #     In > tlok = Translations(reversed_couples)
+            #     Out> Translations {2: 1, 3: 1}
+            #     In > tlok.save()
+            #     DB-- 1 -> 2
+            #     DB-- 1 -> 3
             self.items.update(source)
-            for key, value in self.items.items():
-                self.groups[value].add(key)
+            if not just_items:
+                for key, value in self.items.items():
+                    self.groups[value].add(key)
         else:
             raise TypeError

@@ -138,11 +160,29 @@ class Translations(_BaseClass):
         # insert new data
         bulk_insert(
             NodeNgramNgram,
-            ('node_id', 'ngram2_id', 'ngram1_id', 'score'),
+            ('node_id', 'ngram2_id', 'ngram1_id', 'weight'),
             ((node_id, key, value, 1.0) for key, value in self.items.items())
         )

+
+class WeightedContextIndex(_BaseClass):
+    """
+    associated model   : NodeNodeNgram
+    associated columns : node1_id | node2_id | ngram_id | score (float)
+
+    Tensor representing a contextual index or registry
+    (matrix of weighted ngrams *per* doc *per* context)
+
+    Example: tfidf by corpus
+    """
+    def __init__(self, source=None):
+        self.items = defaultdict(float)
+        # £TODO
+

 class WeightedMatrix(_BaseClass):
     def __init__(self, source=None):

@@ -184,7 +224,7 @@ class WeightedMatrix(_BaseClass):
         # insert new data
         bulk_insert(
             NodeNgramNgram,
-            ('node_id', 'ngram1_id', 'ngram2_id', 'score'),
+            ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
             ((node_id, key1, key2, value) for key1, key2, value in self)
         )
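
The reversed-couples convention documented above can be checked without the DB. A standalone sketch that mimics what Translations.__init__ does with an iterable source (plain dicts, no gargantext imports):

    from collections import defaultdict

    items  = defaultdict(int)    # secondary form -> primary form (like self.items)
    groups = defaultdict(set)    # primary form -> set of secondary forms

    reversed_couples = [(2, 1), (3, 1)]          # (secondary, primary)
    items.update(reversed_couples)
    for key, value in items.items():
        groups[value].add(key)

    print(dict(items))     # {2: 1, 3: 1}
    print(dict(groups))    # {1: {2, 3}}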
gargantext/util/toolchain/__init__.py

 from .parsing import parse
 from .ngrams_extraction import extract_ngrams
 from .hyperdata_indexing import index_hyperdata
+
+# in usual run order
+from .list_stop          import do_stoplist
+from .metric_tfidf       import compute_occs, compute_tfidf
+from .list_main          import do_mainlist
+from .ngram_coocs        import compute_coocs
+from .metric_specificity import compute_specificity
+from .list_map           import do_maplist
+# TEST
+from .ngram_groups       import compute_groups

 from gargantext.util.db import session
 from gargantext.models import Node
+from datetime import datetime


 def parse_extract(corpus):
     # retrieve corpus from database from id

@@ -18,6 +27,12 @@ def parse_extract(corpus):
     # apply actions
     print('CORPUS #%d' % (corpus.id))
     parse(corpus)
+    # was there an error in the process ?
+    if corpus.status()['error']:
+        print("ERROR: aborting parse_extract for corpus #%i" % corpus.id)
+        return None
     print('CORPUS #%d: parsed' % (corpus.id))
     extract_ngrams(corpus)
     print('CORPUS #%d: extracted ngrams' % (corpus.id))

@@ -38,3 +53,55 @@ def parse_extract_indexhyperdata(corpus):
     print('CORPUS #%d: extracted ngrams' % (corpus.id))
     index_hyperdata(corpus)
     print('CORPUS #%d: indexed hyperdata' % (corpus.id))
+
+    # -------------------------------
+    # temporary ngram lists workflow
+    # -------------------------------
+
+    print('CORPUS #%d: [%s] starting ngram lists computation' % (corpus.id, t()))
+
+    # -> stoplist: filter + write (to Node and NodeNgram)
+    stop_id = do_stoplist(corpus)
+    print('CORPUS #%d: [%s] new stoplist node #%i' % (corpus.id, t(), stop_id))
+
+    # -> write groups to Node and NodeNgramNgram
+    group_id = compute_groups(corpus, stoplist_id=None)
+    print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
+
+    # -> write occurrences to Node and NodeNodeNgram  # possible: factorize with tfidf
+    occ_id = compute_occs(corpus)
+    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
+
+    # ------------
+    # -> write local tfidf to Node and NodeNodeNgram
+    ltfidf_id = compute_tfidf(corpus, scope="local")
+    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
+
+    # -> write global tfidf to Node and NodeNodeNgram
+    gtfidf_id = compute_tfidf(corpus, scope="global")
+    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
+
+    # -> mainlist: filter + write (to Node and NodeNgram)
+    mainlist_id = do_mainlist(corpus, tfidf_id=gtfidf_id, stoplist_id=stop_id)
+    print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
+
+    # ------------
+    # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
+    cooc_id = compute_coocs(corpus, mainlist_id=mainlist_id)
+    print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
+
+    # -> specificity: compute + write (=> NodeNodeNgram)
+    spec_id = compute_specificity(corpus, cooc_id=cooc_id)
+    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
+
+    # ?? maplist: compute + write (to Node and NodeNgram)
+    map_id = do_maplist(corpus,
+                        mainlist_id=mainlist_id,
+                        specificity_id=spec_id,
+                        grouplist_id=group_id)
+    print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
+
+
+def t():
+    return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
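
For reference, a sketch of the console trace this workflow prints (corpus and node ids invented, timestamps as produced by t()):

    CORPUS #42: parsed
    CORPUS #42: extracted ngrams
    CORPUS #42: indexed hyperdata
    CORPUS #42: [2016-03-15_10:00:01] starting ngram lists computation
    CORPUS #42: [2016-03-15_10:00:02] new stoplist node #101
    CORPUS #42: [2016-03-15_10:00:04] new grouplist node #102
    CORPUS #42: [2016-03-15_10:00:05] new occs node #103
    CORPUS #42: [2016-03-15_10:00:06] new localtfidf node #104
    CORPUS #42: [2016-03-15_10:00:08] new globaltfidf node #105
    CORPUS #42: [2016-03-15_10:00:09] new mainlist node #106
    CORPUS #42: [2016-03-15_10:00:15] new coocs node #107
    CORPUS #42: [2016-03-15_10:00:17] new specificity node #108
    CORPUS #42: [2016-03-15_10:00:18] new maplist node #109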
gargantext/util/toolchain/list_main.py (new file, mode 100644)

from gargantext.models     import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db    import session
from gargantext.util.lists import UnweightedList
from sqlalchemy            import desc
from gargantext.constants  import DEFAULT_TFIDF_CUTOFF_RATIO, \
                                  DEFAULT_TFIDF_HARD_LIMIT

def do_mainlist(corpus,
                overwrite_id = None,
                tfidf_id     = None,
                stoplist_id  = None,
                hard_limit   = DEFAULT_TFIDF_HARD_LIMIT,
                ratio_limit  = DEFAULT_TFIDF_CUTOFF_RATIO
                ):
    """
    Select top n terms according to a global tfidf ranking and stoplist filter.
    The number of selected terms will be:
        min(hard_limit, number_of_terms * ratio_limit)

    NB: We use a global tfidf node where the values are global but the ngrams
        are already selected (== only within this corpus' documents).
    TO DISCUSS: allow influence of the local tfidf scores too

    Parameters:
      - the corpus itself
      - a tfidf score for ranking the ngrams
      - a stoplist for filtering some ngrams
      - overwrite_id: optional id of a pre-existing MAINLIST node for this corpus
                      (the Node and its previous NodeNgram rows will be replaced)
      + 2 limits to set the amount of picked terms:
        - ratio_limit ∈ [0,1]: a ratio relative to the number of distinct ngrams
                               (default: 0.45)
        - hard_limit: an absolute max value
                      (default: 750)
    """
    # retrieve helper nodes if not provided
    if not tfidf_id:
        tfidf_id = session.query(Node.id).filter(
                       Node.typename == "TFIDF-GLOBAL",
                       Node.parent_id == corpus.id
                   ).first()
        if not tfidf_id:
            raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")

    if not stoplist_id:
        stoplist_id = session.query(Node.id).filter(
                          Node.typename == "STOPLIST",
                          Node.parent_id == corpus.id
                      ).first()
        if not stoplist_id:
            raise ValueError("MAINLIST: STOPLIST node needed for mainlist creation")

    # the ngrams we don't want
    # NOTE: make sure we do this only once during the ngram initial workflow
    stopterms_subquery = (session
                            .query(NodeNgram.ngram_id)
                            .filter(NodeNgram.node_id == stoplist_id)
                            .subquery()
                          )

    # tfidf-ranked query
    ordered_filtered_tfidf = (session
        .query(NodeNodeNgram.ngram_id)
        .filter(NodeNodeNgram.node1_id == tfidf_id)
        .filter(~NodeNodeNgram.ngram_id.in_(stopterms_subquery))
        .order_by(desc(NodeNodeNgram.score))
    )

    # total count
    nb_ngrams = ordered_filtered_tfidf.count()

    # apply ratio to find smallest limit
    our_limit = min(hard_limit, round(nb_ngrams * ratio_limit))
    print("MAINLIST: keeping %i ngrams out of %i" % (our_limit, nb_ngrams))

    # DB retrieve up to limit => MAINLIST
    top_ngrams_ids = ordered_filtered_tfidf.limit(our_limit).all()

    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        # mainlist = cache.Node[overwrite_id]
    else:
        # now create the new MAINLIST node
        mainlist = corpus.add_child(
            typename = "MAINLIST",
            name     = "Mainlist (in:%s)" % corpus.id
        )
        session.add(mainlist)
        session.commit()
        the_id = mainlist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    UnweightedList(top_ngrams_ids).save(the_id)

    return the_id
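
The same selection logic in miniature, on plain Python data (terms and scores invented for illustration):

    tfidf = {'cell': 9.1, 'protein': 7.2, 'assay': 3.3, 'membrane': 2.1, 'value': 1.0}
    stop  = {'value'}

    ranked = sorted((t for t in tfidf if t not in stop), key=tfidf.get, reverse=True)
    our_limit = min(750, round(len(ranked) * .45))    # 4 candidates -> keep 2
    print(ranked[:our_limit])                         # ['cell', 'protein']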
gargantext/util/toolchain/list_map.py (new file, mode 100644)

"""
Selects a subset of corpus ngrams to use in the graph map.
"""
from gargantext.models.ngrams import Node, Ngram, NodeNgram, \
                                     NodeNgramNgram, NodeNodeNgram
from gargantext.util.db       import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.util.lists    import UnweightedList
from sqlalchemy               import desc
from gargantext.constants     import DEFAULT_MAPLIST_MAX, \
                                     DEFAULT_MAPLIST_MONOGRAMS_RATIO

def do_maplist(corpus,
               overwrite_id   = None,
               mainlist_id    = None,
               specificity_id = None,
               grouplist_id   = None,
               limit          = DEFAULT_MAPLIST_MAX,
               monograms_part = DEFAULT_MAPLIST_MONOGRAMS_RATIO
               ):
    '''
    Selects the map list according to specificities and the mainlist.

    Parameters:
      - mainlist_id (starting point, already cleaned of stoplist terms)
      - specificity_id (ranking factor)
      - grouplist_id (filtering grouped ones)
      - overwrite_id: optional id of a preexisting MAPLIST node to overwrite
      + 2 constants to modulate the terms choice:
        - limit for the amount of picked terms
        - monograms_part: a ratio of terms with only one lexical unit to keep
    '''
    if not (mainlist_id and specificity_id and grouplist_id):
        raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")

    monograms_limit = round(limit * monograms_part)
    multigrams_limit = limit - monograms_limit
    print("MAPLIST: monograms_limit =", monograms_limit)
    print("MAPLIST: multigrams_limit = ", multigrams_limit)

    #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

    mainterms_subquery = (session
                            # we want only terms within mainlist
                            .query(NodeNgram.ngram_id)
                            .filter(NodeNgram.node_id == mainlist_id)
                            .subquery()
                          )

    primary_groupterms_subquery = (session
                                     # we want only primary terms (ngram1)
                                     .query(NodeNgramNgram.ngram1_id)
                                     .filter(NodeNgramNgram.node_id == grouplist_id)
                                     .subquery()
                                   )

    ScoreSpec = aliased(NodeNgram)

    # specificity-ranked
    query = (session.query(ScoreSpec.ngram_id)
             .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
             .filter(ScoreSpec.node_id == specificity_id)
             .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
             .filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
             )

    # TODO: move these 2 pools up to mainlist selection
    top_monograms = (query
                     .filter(Ngram.n == 1)
                     .order_by(desc(ScoreSpec.weight))
                     .limit(monograms_limit)
                     .all()
                     )

    top_multigrams = (query
                      .filter(Ngram.n >= 2)
                      .order_by(desc(ScoreSpec.weight))
                      .limit(multigrams_limit)
                      .all()
                      )

    print("MAPLIST: top_monograms =", len(top_monograms))
    print("MAPLIST: top_multigrams = ", len(top_multigrams))

    # NEW MAPLIST NODE
    # -----------------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = {'corpus': corpus.id,
                     'limit': limit,
                     'monograms_part': monograms_part
                     }
    if overwrite_id:
        # overwrite pre-existing node
        the_maplist = cache.Node[overwrite_id]
        the_maplist.hyperdata = new_hyperdata
        the_maplist.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create a new maplist node
        the_maplist = corpus.add_child(
            name      = "Maplist (in %i)" % corpus.id,
            typename  = "MAPLIST",
            hyperdata = new_hyperdata
        )
        session.add(the_maplist)
        session.commit()
        the_id = the_maplist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    datalist = UnweightedList(
        [res.ngram_id for res in top_monograms + top_multigrams]
    )

    # save
    datalist.save(the_id)

    # dbg.show('MapList computed')
    return the_id
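
The monogram/multigram budget is plain arithmetic; a quick check with the shipped defaults:

    limit, monograms_part = 300, .5    # DEFAULT_MAPLIST_MAX, DEFAULT_MAPLIST_MONOGRAMS_RATIO
    monograms_limit  = round(limit * monograms_part)
    multigrams_limit = limit - monograms_limit
    print(monograms_limit, multigrams_limit)    # 150 150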
gargantext/util/toolchain/list_stop.py (new file, mode 100644)

"""
Creates a filtering list for corpus ngrams.
(implementation: regexp + "master" stoplist)
"""
from gargantext.models    import User, Node, Ngram, NodeNgram
from gargantext.util.db   import session, func
from gargantext.constants import LISTTYPES
from re                   import compile
from sqlalchemy           import desc

def is_stop_word(ngram, stop_words=None):
    '''
    ngram :: (Int, String) => (ngram_id, ngram_terms)
    stop_words :: Set of String
    (to avoid an SQL query each time is_stop_word is invoked, pass it in as parameter)
    '''
    word = ngram[1]

    if word in stop_words:
        return True

    compiled_regexes = []   # to compile them only once
    for regex in [
              r"^.{1,2}$"
            , r"(.*)\d(.*)"
            # , r"(.*)(\.)(.*)"       too strong (removes acronyms!)
            , r"(.*)(\,)(.*)"
            , r"(.*)(< ?/?p ?>)(.*)"          # paragraph markers
            , r"(.*)(study)(.*)"
            , r"(.*)\b(xx|xi|xv)\b(.*)"
            , r"(.*)(result)(.*)"
            , r"(.*)(année|nombre|moitié)(.*)"
            , r"(.*)(temps)(.*)"
            , r"(.*)(%)(.*)"
            , r"(.*)(\{)(.*)"
            , r"(.*)(terme)(.*)"
            , r"(.*)(différent)(.*)"
            , r"(.*)(travers)(.*)"
            , r"(.*)(:|\|)(.*)"
            ]:
        compiled_regexes.append(compile(regex))

    for format_regex in compiled_regexes:
        if format_regex.match(word):
            # print("STOPLIST += '%s' (regex: %s)" % (word, format_regex.pattern))
            return True

    return False

def create_gargantua_resources():
    gargantua_id = session.query(User.id).filter(User.username == "gargantua").first()
    project = Node(
        name     = "Resources",
        user_id  = gargantua_id,
        typename = "PROJECT")
    stopList = Node(
        name      = "STOPLIST",
        parent_id = project.id,
        user_id   = gargantua_id,
        typename  = "STOPLIST")
    session.add(project)
    session.add(stopList)
    session.commit()

def do_stoplist(corpus, overwrite_id=None):
    '''
    Create list of stop words.
    TODO do a function to get all stop words with social scores

    Parameters:
      - overwrite_id: optional id of a preexisting STOPLIST node to overwrite
    '''
    # Get preexisting StopList if provided in overwrite_id param
    if overwrite_id:
        stoplist_id = overwrite_id
    # At this step of development, a new StopList should be created
    else:
        stoplist = corpus.add_child(
            name     = "Stoplist (in:%s)" % corpus.id,
            typename = "STOPLIST"
        )
        session.add(stoplist)
        session.commit()
        stoplist_id = stoplist.id

    # Get common resources, all common StopWords on the platform
    ## First get the id of the StopList of the Gargantua super user
    gargantua_id = session.query(User.id).filter(User.username == "gargantua").first()
    rootStopList_id = session.query(Node.id).filter(
        Node.user_id  == gargantua_id,
        Node.typename == "STOPLIST"
    ).first()

    ## Then get all the stop words
    ## stop_words :: [String]
    stop_words = (session.query(Ngram.terms)
                  .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                  .filter(NodeNgram.node_id == rootStopList_id)
                  .all()
                  )
    # print([n for n in stop_words])

    ## Get the ngrams
    ## ngrams :: [(Int, String, Int)]
    ngrams = (session.query(Ngram.id, Ngram.terms)
              .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
              .join(Node, Node.id == NodeNgram.node_id)
              .filter(Node.parent_id == corpus.id,
                      Node.typename == "DOCUMENT")
              .group_by(Ngram.id)
              #.limit(limit)
              .all()
              )

    ngrams_to_stop = filter(
        lambda x: is_stop_word(x, stop_words=stop_words), ngrams
    )
    # print([n for n in ngrams_to_stop])

    stop = LISTTYPES["STOPLIST"]({n[0]: -1 for n in ngrams_to_stop})
    # stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])

    stop.save(stoplist_id)
    return stoplist_id
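
A quick standalone check of two of the stop patterns above (pure Python, no DB):

    from re import compile

    short_or_digit = [compile(r"^.{1,2}$"), compile(r"(.*)\d(.*)")]
    for w in ("of", "b2b", "biodiversity"):
        print(w, any(rx.match(w) for rx in short_or_digit))
    # of True / b2b True / biodiversity False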
gargantext/util/toolchain/metric_specificity.py (new file, mode 100644)

"""
Computes a specificity metric from the ngram cooccurrence matrix.
 + SAVE => WeightedList => NodeNgram
"""
from gargantext.models     import Node, Ngram, NodeNgram, NodeNgramNgram, \
                                  NodeNodeNgram   # needed for the overwrite cleanup below
from gargantext.util.db    import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections           import defaultdict
from pandas                import DataFrame

def compute_specificity(corpus, cooc_id=None, overwrite_id=None):
    '''
    Compute the specificity, simple calculus.

    Parameters:
      - cooc_id: mandatory id of a cooccurrences node to use as base
      - overwrite_id: optional id of a preexisting specificity node to overwrite
    '''
    cooccurrences = (session.query(NodeNgramNgram)
                     .filter(NodeNgramNgram.node_id == cooc_id)
                     )
    # no filtering: new choice, cooc already filtered on tfidf before creation

    matrix = defaultdict(lambda: defaultdict(float))

    # £TODO re-rename weight => score
    for cooccurrence in cooccurrences:
        matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
        matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight

    nb_ngrams = len(matrix)
    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    d = DataFrame(matrix).fillna(0)

    # proba (x|y) ( <= each column is divided by its total)
    d = d / d.sum(axis=0)

    # d:Matrix => v: Vector (len = nb_ngrams)
    v = d.sum(axis=1)

    ## d ##
    #######
    #                Grenelle  biodiversité  kilomètres  site  élus  île
    # Grenelle          0            0           4         0     0    0
    # biodiversité      0            0           0         0     4    0
    # kilomètres        4            0           0         0     4    0
    # site              0            0           0         0     4    6
    # élus              0            4           4         4     0    0
    # île               0            0           0         6     0    0

    ## d.sum(axis=1) ##
    ###################
    # Grenelle       4
    # biodiversité   4
    # kilomètres     8
    # site          10
    # élus          12
    # île            6

    # temporary result
    # ----------------
    # for now we use the row sums as the specificity ranking
    # (**same** order as with the pre-refactoring formula, but simpler to compute)
    # TODO check the mathematical AND semantic soundness of this indicator
    v.sort_values(inplace=True)

    # [ ('biodiversité' , 0.333 ),
    #   ('Grenelle'     , 0.5   ),
    #   ('île'          , 0.599 ),
    #   ('kilomètres'   , 1.333 ),
    #   ('site'         , 1.333 ),
    #   ('élus'         , 1.899 ) ]

    # ----------------
    # specificity node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        specnode = corpus.add_child(
            typename = "SPECIFICITY",
            name     = "Specif (in:%s)" % corpus.id
        )
        session.add(specnode)
        session.commit()
        the_id = specnode.id

    # print(v)
    data = WeightedList(
        zip(v.index.tolist(), v.values.tolist())
    )
    data.save(the_id)

    return(the_id)
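
The worked example in the comments can be reproduced verbatim with pandas (same numbers, same normalization and row sums):

    from collections import defaultdict
    from pandas import DataFrame

    coocs = {('Grenelle', 'kilomètres'): 4, ('biodiversité', 'élus'): 4,
             ('kilomètres', 'élus'): 4, ('site', 'élus'): 4, ('site', 'île'): 6}

    matrix = defaultdict(lambda: defaultdict(float))
    for (a, b), w in coocs.items():          # symmetric fill, as in the loop above
        matrix[a][b] = w
        matrix[b][a] = w

    d = DataFrame(matrix).fillna(0)
    v = (d / d.sum(axis=0)).sum(axis=1)
    print(v.sort_values())
    # matches the commented expectation: biodiversité 0.333, Grenelle 0.5,
    # île 0.6, kilomètres 1.333, site 1.333, élus 1.9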
gargantext/util/toolchain/metric_tfidf.py (new file, mode 100644)

"""
Computes ngram scores with 3 ranking functions:
   - the simple sum of occurrences inside the corpus
   - the tfidf inside the corpus
   - the global tfidf for all corpora having same source

FIXME: "having the same source" means we need to select inside hyperdata
       with a (perhaps costly) JSON query:
       WHERE hyperdata->'resources' @> ...
"""
from gargantext.models  import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session, bulk_insert, func   # = sqlalchemy.func like sum() or count()
from sqlalchemy         import text   # for query from raw SQL statement
from math               import log
# £TODO
# from gargantext.util.lists import WeightedContextIndex


def compute_occs(corpus, overwrite_id=None):
    """
    Calculates sum of occs per ngram within corpus
               (used as info in the ngrams table view)

    ? optimize ? OCCS here could be calculated simultaneously within the TFIDF-CORPUS loop

    Parameters:
        - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
                        (the Node and its previous NodeNodeNgram rows will be replaced)
    """
    # 1) all the doc_ids of our corpus (scope of counts for filter)
    #    slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
    docids_subquery = (session
                        .query(Node.id)
                        .filter(Node.parent_id == corpus.id)
                        .filter(Node.typename == "DOCUMENT")
                        .subquery()
                       )

    # 2) our sums per ngram_id
    occ_sums = (session
                .query(
                    NodeNgram.ngram_id,
                    func.sum(NodeNgram.weight)
                 )
                .filter(NodeNgram.node_id.in_(docids_subquery))
                .group_by(NodeNgram.ngram_id)
                .all()
               )
    # example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), (302, 1.0), ... ]
    #                      ^^^^  ^^^
    #                  ngram_id   sum_wei

    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        # occnode = cache.Node[overwrite_id]
    else:
        # create the new OCCURRENCES node
        occnode = corpus.add_child(
            typename = "OCCURRENCES",
            name     = "occ_sums (in:%s)" % corpus.id
        )
        session.add(occnode)
        session.commit()
        the_id = occnode.id

    # reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((the_id, corpus.id, res[0], res[1]) for res in occ_sums)
    )

    return the_id


def compute_tfidf(corpus, scope="local", overwrite_id=None):
    """
    Calculates tfidf within the current corpus

    Parameters:
      - the corpus itself
      - scope: {"local" or "global"}
      - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
                      (the Node and its previous NodeNodeNgram rows will be replaced)
    """
    # local <=> within this corpus
    if scope == "local":
        # All docs of this corpus
        docids_subquery = (session
                            .query(Node.id)
                            .filter(Node.parent_id == corpus.id)
                            .filter(Node.typename == "DOCUMENT")
                            .subquery()
                           )

    # global <=> within all corpora of this source
    elif scope == "global":
        this_source_type = corpus.resources()[0]['type']

        # all corpora with the same source type
        # (we need a raw SQL query for postgres JSON operators) (TODO test speed)
        same_source_corpora_query = (session
                                      .query(Node.id)
                                      .from_statement(text(
                                          """
                                          SELECT id FROM nodes
                                          WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
                                          """ % this_source_type
                                      ))
                                     )

        # All docs **in all corpora of the same source**
        docids_subquery = (session
                            .query(Node.id)
                            .filter(Node.parent_id.in_(same_source_corpora_query))
                            .filter(Node.typename == "DOCUMENT")
                            .subquery()
                           )

    # N
    total_docs = session.query(docids_subquery).count()

    # or perhaps at least do the occurrences right now at the same time
    tf_nd = (session
             .query(
                 NodeNgram.ngram_id,
                 func.sum(NodeNgram.weight),     # tf: same as occnode
                 func.count(NodeNgram.node_id)   # nd: n docs with term
              )
             .filter(NodeNgram.node_id.in_(docids_subquery))
             .group_by(NodeNgram.ngram_id)
             .all()
            )

    # -------------------------------------------------
    tfidfs = {}
    log_tot_docs = log(total_docs)
    for (ngram_id, tf, nd) in tf_nd:
        # tfidfs[ngram_id] = tf * log(total_docs/nd)
        tfidfs[ngram_id] = tf * (log_tot_docs - log(nd))
    # -------------------------------------------------

    if overwrite_id:
        the_id = overwrite_id
    else:
        # create the new TFIDF-XXXX node
        tfidf_nd = corpus.add_child()
        if scope == "local":
            tfidf_nd.typename = "TFIDF-CORPUS"
            tfidf_nd.name = "tfidf-c (in:%s)" % corpus.id
        elif scope == "global":
            tfidf_nd.typename = "TFIDF-GLOBAL"
            tfidf_nd.name = "tfidf-g (in type:%s)" % this_source_type
        session.add(tfidf_nd)
        session.commit()
        the_id = tfidf_nd.id

    # reflect that in NodeNodeNgrams
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((the_id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
    )

    return the_id
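
The ranking formula in isolation: tf * (log(N) - log(nd)) is just tf * log(N/nd). A standalone check with invented counts:

    from math import log, isclose

    total_docs = 1000          # N (assumed corpus size)
    tf, nd     = 7, 50         # term occurs 7 times, in 50 distinct docs
    score = tf * (log(total_docs) - log(nd))
    assert isclose(score, tf * log(total_docs / nd))
    print(score)               # ≈ 20.97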
gargantext/util/toolchain/ngram_coocs.py (new file, mode 100644)

from gargantext.models        import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists    import WeightedMatrix
from gargantext.util.db       import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants     import DEFAULT_COOC_THRESHOLD

def compute_coocs(corpus,
                  overwrite_id    = None,
                  threshold       = DEFAULT_COOC_THRESHOLD,
                  mainlist_id     = None,
                  stoplist_id     = None,
                  symmetry_filter = True):
    """
    Count how often some extracted terms appear
    together in a small context (document)
    throughout a larger context (corpus).

            [NodeNgram]                       [NodeNgramNgram]
    node_id | ngram_id | weight       ngram1_id | ngram2_id | score |
    --------+----------+--------      ----------+-----------+-------+
     MYDOCA |    487   |   1     =>       487   |    294    |   2   |
     MYDOCA |    294   |   3
     MYDOCB |    487   |   1
     MYDOCB |    294   |   4

    Fill that info in DB:
      - a *new* COOCCURRENCES node
      - and all corresponding NodeNgramNgram rows

    worst case complexity ~ O(N²/2) with N = number of ngrams

    If a mainlist is provided, we filter doc ngrams to those also in the list.

    Parameters:
      - the corpus node
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                      (all hyperdata and previous NodeNgramNgram rows will be replaced)
      - threshold: on output cooc count (previously called hapax)
      - mainlist_id: mainlist to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is provided)

    (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
      - isMonopartite: ?? used a nodes_hyperdata_ngrams table ???

    basic idea for one doc
    ======================
    each pair of ngrams sharing same doc (node_id)
        SELECT idx1.ngram_id, idx2.ngram_id
        FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
        ---------------------------------
        WHERE idx1.node_id = idx2.node_id      <== that's cooc
        ---------------------------------
          AND idx1.ngram_id <> idx2.ngram_id
          AND idx1.node_id = MY_DOC;

    on entire corpus
    ================
    coocs for each doc:
      - each given pair like (termA, termB) will likely appear several times
        => we do GROUP BY (x1.ngram_id, x2.ngram_id)
      - we count unique appearances of the pair (cooc)
    """
    # - TODO cvalue_id: allow a metric as additional input filter
    # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
    # - TODO start, end : filter on document date
    # - TODO weighted: if False normal cooc to be saved as result
    #                  if True  weighted cooc (experimental)

    # /!\ big combinatorial complexity /!\
    # for 8439 rows in the nodes_ngrams index, of which 1442 have occ > 1:
    #   - 1,859,408 rows for the simple cooc query
    #   -    71,134 rows when limited to ngrams with occ > 1 (weight)

    # docs of our corpus
    docids_subquery = (session
                        .query(Node.id)
                        .filter(Node.parent_id == corpus.id)
                        .filter(Node.typename == "DOCUMENT")
                        .subquery()
                       )

    # 2 x the occurrence index table
    x1 = aliased(NodeNgram)
    x2 = aliased(NodeNgram)

    # cooccurrences columns definition
    ucooc = func.count(x1.ngram_id).label("ucooc")

    # 1) MAIN DB QUERY
    coocs_query = (session.query(x1.ngram_id, x2.ngram_id, ucooc)
                   .filter(x1.node_id == x2.node_id)          # <- by definition of cooc
                   .filter(x1.ngram_id != x2.ngram_id)        # <- b/c not with itself
                   .filter(x1.node_id.in_(docids_subquery))   # <- b/c within corpus
                   .group_by(x1.ngram_id, x2.ngram_id)
                   )

    # 2) INPUT FILTERS (reduce N before O(N²))
    if mainlist_id:
        main_subquery = (session
                          .query(NodeNgram.ngram_id)
                          .filter(NodeNgram.node_id == mainlist_id)
                          .subquery()
                         )
        coocs_query = (coocs_query
                       .filter(x1.ngram_id.in_(main_subquery))
                       .filter(x2.ngram_id.in_(main_subquery))
                       )

    if stoplist_id:
        stop_subquery = (session
                          .query(NodeNgram.ngram_id)
                          .filter(NodeNgram.node_id == stoplist_id)
                          .subquery()
                         )
        coocs_query = (coocs_query
                       .filter(~x1.ngram_id.in_(stop_subquery))
                       .filter(~x2.ngram_id.in_(stop_subquery))
                       )

    if symmetry_filter:
        # 1 filter taking symmetry into account
        #   -> halves the work!!
        #   -> but prevents direct access to x2's cooccurrences
        #   -> they get scattered: stored in the x1 rows that preceded x2
        #   -> retrieval will be more costly, via OR queries such as:
        #      WHERE ngram1 = my_ngram OR ngram2 = my_ngram
        coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)

    # ------------
    # 2 possible upstream filters to reduce the combinatorics
    # - for example 929k rows => 35k rows
    # - here on weight, but it degrades the results
    #   => conceivable on another metric (cvalue or tfidf?)
    # coocs_query = coocs_query.filter(x1.weight > 1)
    # coocs_query = coocs_query.filter(x2.weight > 1)
    # ------------

    # 3) OUTPUT FILTERS
    # ------------------
    # threshold
    # £TODO adjust COOC_THRESHOLD a posteriori:
    # ex: sometimes 2 sometimes 4 depending on sparsity
    coocs_query = coocs_query.having(ucooc >= threshold)

    # 4) EXECUTE QUERY
    # ----------------
    # => storage in our matrix structure
    matrix = WeightedMatrix(coocs_query.all())

    # fyi
    # shape_0 = len({pair[0] for pair in matrix.items})
    # shape_1 = len({pair[1] for pair in matrix.items})
    # print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

    # 5) SAVE
    # --------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = {'corpus': corpus.id,
                     'threshold': threshold
                     }
    if overwrite_id:
        # overwrite pre-existing id
        the_cooc = cache.Node[overwrite_id]
        the_cooc.hyperdata = new_hyperdata
        the_cooc.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create the new cooc node
        the_cooc = corpus.add_child(
            typename  = "COOCCURRENCES",
            name      = "Coocs (in:%s)" % corpus.name[0:10],
            hyperdata = new_hyperdata,
        )
        session.add(the_cooc)
        session.commit()
        the_id = the_cooc.id

    # ==> save all NodeNgramNgram with link to new cooc node id
    matrix.save(the_id)

    return the_id
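
The docstring's toy table, counted with plain Python instead of SQL (ngram ids 487 and 294 as above; the symmetry filter keeps the ordered pair only):

    from collections import Counter
    from itertools import combinations

    doc_ngrams = {'MYDOCA': [487, 294], 'MYDOCB': [487, 294]}
    ucooc = Counter()
    for ngrams in doc_ngrams.values():
        for n1, n2 in combinations(sorted(ngrams), 2):   # n1 < n2: symmetry filter
            ucooc[(n1, n2)] += 1
    print(ucooc)   # Counter({(294, 487): 2})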
gargantext/util/toolchain/ngram_groups.py (new file, mode 100644)

"""
For initial ngram groups via stemming
Example:
  - groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
  - groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
"""
from gargantext.models     import Node, NodeNgram, NodeNgramNgram
from gargantext.util.db    import session
from gargantext.util.lists import Translations

# to convert fr => french :/
from gargantext.util.languages import languages

from re          import split as resplit
from collections import defaultdict, Counter

from nltk.stem.snowball import SnowballStemmer

def prepare_stemmers(corpus):
    """
    Returns *several* stemmers (one for each language in the corpus)
    (as a dict of stemmers with key = language_iso2)
    """
    stemmers_by_lg = {
        # always get a generic stemmer in case language code unknown
        '__unknown__': SnowballStemmer("english")
    }
    for lgiso2 in corpus.hyperdata['languages'].keys():
        lgname = languages[lgiso2].name.lower()
        stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
    return stemmers_by_lg

def compute_groups(corpus, stoplist_id=None, overwrite_id=None):
    """
    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
    3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)
    """
    stop_ngrams_ids = {}
    # we will need the ngrams of the stoplist to filter
    if stoplist_id is not None:
        for id in session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stoplist_id).all():
            stop_ngrams_ids[id[0]] = True

    # 1) compute stems/lemmas
    #    and group if same stem/lemma
    stemmers = prepare_stemmers(corpus)

    # todo dict {lg => {ngrams_todo} }
    todo_ngrams_per_lg = defaultdict(set)

    # res dict { commonstem: {ngram_1:freq_1, ngram_2:freq_2, ngram_3:freq_3} }
    my_groups = defaultdict(Counter)

    # preloop per doc to sort ngrams by language
    for doc in corpus.children():
        if ('language_iso2' in doc.hyperdata):
            lgid = doc.hyperdata['language_iso2']
        else:
            lgid = "__unknown__"

        # doc.ngrams is an sql query (ugly but useful intermediate step)
        # FIXME: move the counting and stoplist filtering up here
        for ngram_pack in doc.ngrams.all():
            todo_ngrams_per_lg[lgid].add(ngram_pack)

    # --------------------
    # long loop per ngrams
    for (lgid, todo_ngs) in todo_ngrams_per_lg.items():
        # fun: word::str => stem::str
        stem_it = stemmers[lgid].stem

        for ng in todo_ngs:
            doc_wei = ng[0]
            ngram   = ng[1]     # Ngram obj

            # skip if in STOPLIST
            if ngram.id in stop_ngrams_ids:
                continue

            lexforms = [lexunit for lexunit in resplit(r'\W+', ngram.terms)]

            # STEM IT, and this term's stems will become a new grouping key...
            stemseq = " ".join([stem_it(lexfo) for lexfo in lexforms])

            # ex:
            # groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
            # groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
            my_groups[stemseq][ngram.id] += doc_wei

    del todo_ngrams_per_lg

    # now serializing all groups to a list of couples
    ng_couples = []
    addcouple = ng_couples.append
    for grped_ngramids in my_groups.values():
        if len(grped_ngramids) > 1:
            # first find most frequent term in the counter
            winner_id = grped_ngramids.most_common(1)[0][0]
            for ngram_id in grped_ngramids:
                if ngram_id != winner_id:
                    addcouple((winner_id, ngram_id))
    del my_groups

    # 2) the list node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
    # or create the new id
    else:
        the_group = corpus.add_child(
            typename = "GROUPLIST",
            name     = "Group (src:%s)" % corpus.name[0:10]
        )
        # and save the node
        session.add(the_group)
        session.commit()
        the_id = the_group.id

    # 3) Save each grouping couple to DB thanks to Translations.save() table
    ndngng_list = Translations(
        [(sec, prim) for (prim, sec) in ng_couples],
        just_items = True
    )

    # ...referring to the list node we just got
    ndngng_list.save(the_id)

    return the_id
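
The grouping key in miniature (requires nltk; uses the English example from the module docstring):

    from nltk.stem.snowball import SnowballStemmer

    stem_it = SnowballStemmer("english").stem
    for term in ("copper engraving", "coppers engraver"):
        stemseq = " ".join(stem_it(w) for w in term.split())
        print(term, "->", stemseq)    # both map to 'copper engrav'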
gargantext/util/toolchain/parsing.py

@@ -2,11 +2,16 @@ from gargantext.util.db import *
 from gargantext.models import *
 from gargantext.constants import *

+from collections import defaultdict
+

 def parse(corpus):
     try:
         documents_count = 0
         corpus.status('parsing', progress=0)
+
+        # will gather info about languages
+        observed_languages = defaultdict(int)
+
         # retrieve resource information
         for resource in corpus.resources():
             # information about the resource

@@ -22,6 +27,7 @@ def parse(corpus):
                     hyperdata = hyperdata,
                 )
                 session.add(document)
+                observed_languages[hyperdata["language_iso2"]] += 1
                 if documents_count % BATCH_PARSING_SIZE == 0:
                     corpus.status('parsing', progress=documents_count)
                     corpus.save_hyperdata()

@@ -29,6 +35,8 @@ def parse(corpus):
                 documents_count += 1
             # update info about the resource
             resource['extracted'] = True
+        # add a corpus-level info about languages
+        corpus.hyperdata['languages'] = observed_languages
         # commit all changes
         corpus.status('parsing', progress=documents_count, complete=True)
         corpus.save_hyperdata()
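
The language bookkeeping added here, in isolation (document languages invented):

    from collections import defaultdict

    observed_languages = defaultdict(int)
    for hyperdata in ({"language_iso2": "en"}, {"language_iso2": "fr"}, {"language_iso2": "en"}):
        observed_languages[hyperdata["language_iso2"]] += 1

    print(dict(observed_languages))   # {'en': 2, 'fr': 1}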
gargantext/views/pages/projects.py

@@ -94,23 +94,36 @@ def project(request, project_id):
             )
             session.add(corpus)
             session.commit()

             # parse_extract: fileparsing -> ngram extraction -> lists
-            scheduled(parse_extract)(corpus.id)
+            scheduled(parse_extract_indexhyperdata)(corpus.id)
+            #scheduled(parse_extract)(corpus.id)

     # corpora within this project
     corpora = project.children('CORPUS').all()
     sourcename2corpora = defaultdict(list)
     for corpus in corpora:
         # we only consider the first resource of the corpus to determine its type
-        resource = corpus.resources()[0]
-        resource_type_name = RESOURCETYPES[resource['type']]['name']
+        resources = corpus.resources()
+        if len(resources):
+            resource = resources[0]
+            resource_type_name = RESOURCETYPES[resource['type']]['name']
+        else:
+            print("(WARNING) PROJECT view: no listed resource")

         # add some data for the viewer
         corpus.count = corpus.children('DOCUMENT').count()
         status = corpus.status()
         if status is not None and not status['complete']:
-            corpus.status_message = '(in progress: %s, %d complete)' % (
-                status['action'].replace('_', ' '),
-                status['progress'],
-            )
+            if not status['error']:
+                corpus.status_message = '(in progress: %s, %d complete)' % (
+                    status['action'].replace('_', ' '),
+                    status['progress'],
+                )
+            else:
+                corpus.status_message = '(aborted: "%s" after %i docs)' % (
+                    status['error'][-1],
+                    status['progress']
+                )
         else:
             corpus.status_message = ''
         # add