Commit 0b94f4b0, authored May 20, 2016 by delanoe

    Merge branch 'romain-refactoring' into unstable

Parents: 2eb7e07e, 7c14a4ed

Showing 5 changed files with 183 additions and 93 deletions (+183, -93):
  gargantext/util/parsers/_Parser.py               +19   -2
  gargantext/util/toolchain/__init__.py             +4   -0
  gargantext/util/toolchain/metric_tfidf.py       +152  -85
  gargantext/util/toolchain/ngrams_extraction.py    +1   -1
  gargantext/views/api/ngramlists.py                +7   -5
gargantext/util/parsers/_Parser.py

@@ -126,10 +126,27 @@ class Parser:
                 break
             except KeyError:
                 language_keyerrors[key] = language_symbol
 
         # languages can find Language objects from any code iso2 or iso3
         # --------------------------------------------------------------
         #   > languages['fr']
         #   <Language iso3="fra" iso2="fr" implemented="True" name="French">
         #   > languages['fra']
         #   <Language iso3="fra" iso2="fr" implemented="True" name="French">
         if language is not None:
-            hyperdata['language_iso2'] = language.iso2
-            hyperdata['language_iso3'] = language.iso3
             hyperdata['language_name'] = language.name
+            hyperdata['language_iso3'] = language.iso3
+            if (language.iso2 is not None):
+                # NB: language can be recognized through iso3 but have no iso2!!
+                #     because there are *more* languages in iso3 codes (iso-639-3)
+                #     example:
+                #       > languages['dnj']
+                #       <Language iso3="dnj" iso2="None" implemented="False" name="Dan">
+                # ----
+                hyperdata['language_iso2'] = language.iso2
+            else:
+                # 'None' would become json 'null' ==> "__unknown__" more stable
+                hyperdata['language_iso2'] = "__unknown__"
+        elif language_keyerrors:
+            print('Unrecognized language: %s' % ', '.join(
+                '%s="%s"' % (key, value)
+                for key, value in language_keyerrors.items()))
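A note on the iso2 fallback above: iso-639-3 has codes for many languages that have no iso-639-1 (iso2) equivalent, so iso2 can legitimately be missing even when the language was recognized. A minimal standalone sketch of the same rule (the Language stub here is hypothetical, not the project's real class):

    from collections import namedtuple

    Language = namedtuple('Language', ['iso2', 'iso3', 'name'])  # hypothetical stub

    def language_fields(language):
        """Hyperdata fields derived from a recognized language, with iso2 fallback."""
        return {
            'language_name': language.name,
            'language_iso3': language.iso3,
            # iso-639-3 covers more languages than iso-639-1, so iso2 can be absent;
            # "__unknown__" is more stable than None (which would serialize to json null)
            'language_iso2': language.iso2 if language.iso2 is not None else "__unknown__",
        }

    print(language_fields(Language('fr', 'fra', 'French')))
    # -> {'language_name': 'French', 'language_iso3': 'fra', 'language_iso2': 'fr'}
    print(language_fields(Language(None, 'dnj', 'Dan')))
    # -> {'language_name': 'Dan', 'language_iso3': 'dnj', 'language_iso2': '__unknown__'}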
gargantext/util/toolchain/__init__.py

@@ -54,6 +54,10 @@ def parse_extract_indexhyperdata(corpus):
     corpus.status('Workflow', progress=1)
     corpus.save_hyperdata()
     session.commit()
+    # FIXME: 'Workflow' will still be uncomplete when 'Index' and 'Lists' will
+    #        get stacked into hyperdata['statuses'], but doing corpus.status()
+    #        will return only the 1st uncomplete action (corpus.status() doesn't
+    #        understand "subactions")
 
     # apply actions
     print('CORPUS #%d' % (corpus.id))
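To see why the FIXME matters: a status reader that only reports the first incomplete action has no way to express that 'Workflow' still has pending subactions. A toy illustration (hypothetical data shapes, not the real corpus API):

    statuses = [{'action': 'Workflow', 'complete': True},    # marked complete too early
                {'action': 'Index',    'complete': False},   # conceptually a subaction
                {'action': 'Lists',    'complete': False}]   # of 'Workflow'

    def first_incomplete(statuses):
        """Mimics the corpus.status() limitation: first incomplete action, no subactions."""
        return next((s for s in statuses if not s['complete']), None)

    print(first_incomplete(statuses))  # {'action': 'Index', 'complete': False}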
gargantext/util/toolchain/metric_tfidf.py

@@ -9,59 +9,108 @@ FIXME: "having the same source" means we need to select inside hyperdata
 """
 from gargantext.models         import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
-from gargantext.util.db        import session, bulk_insert, func  # = sqlalchemy.func like sum() or count()
+from sqlalchemy                import text      # for query from raw SQL statement
+from gargantext.util.db        import session, bulk_insert, aliased, \
+                                      func      # = sqlalchemy.func like sum() or count()
+from sqlalchemy.sql.expression import case      # for choice if ngram has mainform or not
+from sqlalchemy                import distinct  # for list of unique ngram_ids within a corpus
 from math                      import log
 # £TODO
 # from gargantext.util.lists import WeightedContextIndex
 
-def compute_occs(corpus, overwrite_id=None):
+def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     """
     # TODO check if cumulated occs correspond to app's use cases and intention
 
-    Calculates sum of occs per ngram within corpus
+    Calculates sum of occs per ngram (or per mainform if groups) within corpus
     (used as info in the ngrams table view)
 
     ? optimize ? OCCS here could be calculated simultaneously within TFIDF-CORPUS loop
 
     ? use cases ?
       => not the main score for users (their intuition for nb of docs having word)
      => but is the main weighting value for any NLP task
+
+    Parameters:
+        - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
+                        (the Node and its previous NodeNodeNgram rows will be replaced)
+        - groupings_id: optional id of a GROUPLIST node for this corpus
+                        IF absent the occurrences are the sums for each ngram
+                        IF present they're the sums for each ngram's mainform
     """
-    # 0) Get the groups
-    group_id = (session.query(Node.id)
-                       .filter(Node.parent_id == corpus.id)
-                       .filter(Node.typename == "GROUPLIST")
-                       .first()
-               )
-
-    # 1) all the doc_ids of our corpus (scope of counts for filter)
-    #    slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
-    docids_subquery = (session.query(Node.id)
-                              .filter(Node.parent_id == corpus.id)
-                              .filter(Node.typename == "DOCUMENT")
-                              .subquery()
-                      )
-
-    # 2) our sums per ngram_id
-    occ_sums = (session.query(NodeNgram.ngram_id,
-                              func.sum(NodeNgram.weight))
-                #.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
-                .filter(NodeNgram.node_id.in_(docids_subquery))
-                .group_by(NodeNgram.ngram_id)
-                .all()
+    # simple case : no groups
+    # ---------
+    # (the occurrences are the sums for each ngram)
+    if not groupings_id:
+        # NodeNgram index
+        occs_q = (session.query(NodeNgram.ngram_id,
+                                func.sum(NodeNgram.weight)  # <== OCCURRENCES
+                               )
+                  # filter docs within corpus
+                  .join(Node)
+                  .filter(Node.parent_id == corpus.id)
+                  .filter(Node.typename == "DOCUMENT")
+
+                  # for the sum
+                  .group_by(NodeNgram.ngram_id)
+                 )
+
+    # difficult case: with groups
+    # ------------
+    # (the occurrences are the sums for each ngram's mainform)
+    else:
+        # sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
+        syn = (session.query(NodeNgramNgram.ngram1_id,
+                             NodeNgramNgram.ngram2_id)
+               .filter(NodeNgramNgram.node_id == groupings_id)
+               .subquery()
+              )
+
+        # NodeNgram index with additional subform => mainform replacement
+        occs_q = (session.query(
+                            # intermediate columns for debug
+                            # -------------------------------
+                            # NodeNgram.node_id,   # document
+                            # NodeNgram.ngram_id,  # <= the occurring ngram
+                            # NodeNgram.weight,    # <= its frequency in doc
+                            # syn.c.ngram1_id      # mainform
+                            # syn.c.ngram2_id,     # subform
+
+                            # ngram to count aka counted_form
+                            # ----------------------------------
+                            #    either NodeNgram.ngram_id as before
+                            #        or mainform if it exists
+                            case([(syn.c.ngram1_id != None, syn.c.ngram1_id)],
+                                 else_=NodeNgram.ngram_id).label("counted_form"),
+
+                            # the sum itself
+                            # --------------
+                            func.sum(NodeNgram.weight)  # <== OCCURRENCES
+                        )
+                  # this brings the mainform if NodeNgram.ngram_id has one in syn
+                  .outerjoin(syn, syn.c.ngram2_id == NodeNgram.ngram_id)
+
+                  # filter docs within corpus
+                  .join(Node)
+                  .filter(Node.parent_id == corpus.id)
+                  .filter(Node.typename == "DOCUMENT")
+
+                  # for the sum
+                  .group_by("counted_form")
+                 )
+
+    occ_sums = occs_q.all()
 
     # example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), (302, 1.0), ... ]
     #                     ^^^^  ^^^
-    #                  ngram_id  sum_wei
+    #                  ngram_id  sum_wei
+    #                      OR
+    #                counted_form
 
     if overwrite_id:
         # overwrite pre-existing id
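The heart of the grouped branch is the OUTER JOIN on the synonyms subquery plus a mainform-or-self column to group on. A self-contained sketch of the same idea with SQLAlchemy 1.4+ core on an in-memory SQLite database (toy tables, not the project's models; coalesce() stands in for the commit's case(), which is equivalent here since ngram1_id is non-NULL exactly when the join matched):

    from sqlalchemy import (Column, Float, Integer, MetaData, Table,
                            create_engine, func, select)

    engine = create_engine('sqlite://')
    meta = MetaData()
    nodes_ngrams = Table('nodes_ngrams', meta,
                         Column('node_id', Integer),   # document
                         Column('ngram_id', Integer),  # occurring ngram
                         Column('weight', Float))      # its frequency in doc
    synonyms = Table('synonyms', meta,
                     Column('ngram1_id', Integer),     # mainform
                     Column('ngram2_id', Integer))     # subform
    meta.create_all(engine)

    with engine.begin() as conn:
        conn.execute(nodes_ngrams.insert(), [
            {'node_id': 1, 'ngram_id': 10, 'weight': 2.0},  # a mainform itself
            {'node_id': 1, 'ngram_id': 11, 'weight': 1.0},  # subform of 10
            {'node_id': 2, 'ngram_id': 11, 'weight': 3.0},  # subform of 10
            {'node_id': 2, 'ngram_id': 20, 'weight': 1.0},  # ungrouped ngram
        ])
        conn.execute(synonyms.insert(), [{'ngram1_id': 10, 'ngram2_id': 11}])

        # mainform if the outer join found one, else the ngram itself
        counted_form = func.coalesce(synonyms.c.ngram1_id,
                                     nodes_ngrams.c.ngram_id).label('counted_form')
        occs_q = (select(counted_form, func.sum(nodes_ngrams.c.weight))
                  .select_from(nodes_ngrams.outerjoin(
                      synonyms, synonyms.c.ngram2_id == nodes_ngrams.c.ngram_id))
                  .group_by(counted_form)
                  .order_by(counted_form))

        print(conn.execute(occs_q).all())  # [(10, 6.0), (20, 1.0)]

The subform occurrences (1.0 + 3.0) are credited to mainform 10 together with its own 2.0, while the ungrouped ngram 20 keeps its own count.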
@@ -92,7 +141,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
     """
     # TODO check if cumulated tfs correspond to app's use cases and intention
 
-    Calculates tfidf ranking (cumulated tfidf) within the given scope
+    Calculates tfidf ranking (cumulated tfidf for each ngram) within given scope
 
     Parameters:
       - the corpus itself
@@ -100,92 +149,110 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
           - local  <=> frequencies counted in the current corpus
           - global <=> frequencies counted in all corpora of this type
 
       when the count_scope is global, there is another parameter:
         - termset_scope: {"local" or "global"}
            - local  <=> output list of terms limited to the current corpus
-                        (SELECT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
+                        (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
            - global <=> output list of terms from all corpora of this type
-                        !!!! (more terms)
+                        !!!! (many more terms)
 
       - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
                       (the Node and its previous NodeNodeNgram rows will be replaced)
     """
-    corpus_docids_subquery = (session.query(Node.id)
-                                     .filter(Node.parent_id == corpus.id)
-                                     .filter(Node.typename == "DOCUMENT")
-                                     .subquery()
-                             )
+    # MAIN QUERY SKELETON
+    tf_nd_query = (session.query(NodeNgram.ngram_id,
+                                 func.sum(NodeNgram.weight),    # tf: same as occurrences
+                                                                # -----------------------
+                                 func.count(NodeNgram.node_id)  # nd: n docs with term
+                                                                # --------------------
+                                )
+                   .group_by(NodeNgram.ngram_id)
+
+                   # optional *count_scope*: if we'll restrict the doc nodes
+                   # -------------
+                   # .join(countdocs_subquery,
+                   #       countdocs_subquery.c.id == NodeNgram.node_id)
+
+                   # optional *termset_scope*: if we'll restrict the ngrams
+                   # ---------------
+                   # .join(termset_subquery,
+                   #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
+                  )
+
+    # validate string params
+    if count_scope not in ["local", "global"]:
+        raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
+    if termset_scope not in ["local", "global"]:
+        raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
+    if count_scope == "local" and termset_scope == "global":
+        raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
 
     # local <=> within this corpus
     if count_scope == "local":
         # All docs of this corpus
-        count_scope_subquery = corpus_docids_subquery
-
-        termset_scope_subquery = (session.query(NodeNgram.ngram_id)
-                                         .filter(NodeNgram.node_id.in_(corpus_docids_subquery))
-                                 )
+        countdocs_subquery = (session.query(Node.id)
+                                     .filter(Node.typename == "DOCUMENT")
+                                     .filter(Node.parent_id == corpus.id)
+                                     .subquery()
+                             )
+
+        # both scopes are the same: no need to independantly restrict the ngrams
+        tf_nd_query = tf_nd_query.join(countdocs_subquery,
+                                       countdocs_subquery.c.id == NodeNgram.node_id)
 
     # global <=> within all corpora of this source
     elif count_scope == "global":
         this_source_type = corpus.resources()[0]['type']
 
-        # all corpora with the same source type
-        # (we need raw SQL query for postgres JSON operators) (TODO test speed)
-        same_source_corpora_query = (session.query(Node.id)
-                                            .from_statement(text(
-                                                """
-                                                SELECT id FROM nodes
-                                                WHERE hyperdata->'resources' @> '[{"type": %s}]'
-                                                """ % this_source_type
-                                            ))
-                                    )
+        CorpusNode = aliased(Node)
 
         # All docs **in all corpora of the same source**
-        ressource_docids_subquery = (session.query(Node.id)
-                                            .filter(Node.parent_id.in_(same_source_corpora_query))
-                                            .filter(Node.typename == "DOCUMENT")
-                                            .subquery()
-                                    )
-
-        count_scope_subquery = ressource_docids_subquery
-
-        if termset_scope == "global":
-            termset_scope_subquery = (session.query(NodeNgram.ngram_id)
-                                             .filter(NodeNgram.node_id.in_(ressource_docids_subquery))
-                                             .subquery()
-                                     )
-        else:
-            termset_scope_subquery = (session.query(NodeNgram.ngram_id)
-                                             .filter(NodeNgram.node_id.in_(corpus_docids_subquery))
-                                     )
+        countdocs_subquery = (session.query(Node.id)
+                                     .filter(Node.typename == "DOCUMENT")
+
+                                     # join on parent_id with selected corpora nodes
+                                     .join(CorpusNode, CorpusNode.id == Node.parent_id)
+                                     .filter(CorpusNode.typename == "CORPUS")
+                                     .filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(this_source_type))
+                                     .subquery()
+                             )
+
+        if termset_scope == "global":
+            # both scopes are the same: no need to independantly restrict the ngrams
+            tf_nd_query = tf_nd_query.join(countdocs_subquery,
+                                           countdocs_subquery.c.id == NodeNgram.node_id)
+
+        elif termset_scope == "local":
+            # All unique terms in the original corpus
+            termset_subquery = (session.query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
+                                       .join(Node)
+                                       .filter(Node.typename == "DOCUMENT")
+                                       .filter(Node.parent_id == corpus.id)
+                                       .subquery()
+                               )
+
+            # only case of independant restrictions on docs and terms
+            tf_nd_query = (tf_nd_query
+                           .join(countdocs_subquery,
+                                 countdocs_subquery.c.id == NodeNgram.node_id)
+                           .join(termset_subquery,
+                                 termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
+                          )
 
     # N
-    total_docs = session.query(ressource_docids_subquery).count()
+    total_docs = session.query(countdocs_subquery).count()
 
-    # nb: possible to do the occurrences right now at the same time
-    tf_nd = (session.query(NodeNgram.ngram_id,
-                           func.sum(NodeNgram.weight),    # tf: same as occnode
-                           func.count(NodeNgram.node_id)  # nd: n docs with term
-                          )
-             .filter(NodeNgram.node_id.in_(count_scope_subquery))
-             .filter(NodeNgram.ngram_id.in_(termset_scope_subquery))
-             .group_by(NodeNgram.ngram_id)
-             .all()
-            )
+    # result
+    tf_nd = tf_nd_query.all()
 
     # -------------------------------------------------
     tfidfs = {}
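Aside on the "global" branch: the commit drops the raw-SQL containment test (hyperdata->'resources' @> ...) in favor of an aliased-node join that reads the first resource's type through the JSONB path operators. Both shapes can be written directly in SQLAlchemy; a small sketch that only compiles the two filters (a toy nodes table standing in for the real model, no database needed):

    from sqlalchemy import Column, Integer, MetaData, Table
    from sqlalchemy.dialects import postgresql
    from sqlalchemy.dialects.postgresql import JSONB

    nodes = Table('nodes', MetaData(),
                  Column('id', Integer, primary_key=True),
                  Column('hyperdata', JSONB))

    this_source_type = 3  # hypothetical resource type id

    # (a) containment, like the removed raw SQL: compiles to the @> operator
    flt_contains = nodes.c.hyperdata['resources'].contains([{'type': this_source_type}])

    # (b) path access + astext, like the new CorpusNode filter:
    #     ((hyperdata -> 'resources') -> 0) ->> 'type' = '3'
    flt_astext = (nodes.c.hyperdata['resources'][0]['type'].astext
                  == str(this_source_type))

    for flt in (flt_contains, flt_astext):
        print(flt.compile(dialect=postgresql.dialect()))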
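For orientation, what follows the truncated hunk: the (ngram_id, tf, nd) triples and total_docs feed the tfidfs dict. The exact weighting is not visible in this diff; a minimal sketch of one standard cumulated tf-idf, using the math.log imported above (made-up numbers):

    from math import log

    total_docs = 100                          # N: docs in the count scope
    tf_nd = [(1970, 6.0, 3), (2024, 2.0, 2)]  # (ngram_id, tf, nd) examples

    tfidfs = {}
    for ngram_id, tf, nd in tf_nd:
        # cumulated term frequency weighted by inverse document frequency
        tfidfs[ngram_id] = tf * log(total_docs / nd)

    print(tfidfs)  # {1970: 21.04..., 2024: 7.82...}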
gargantext/util/toolchain/ngrams_extraction.py

@@ -56,7 +56,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', )):
                 ngramsextractor = ngramsextractors[language_iso2]
             except KeyError:
                 # skip document
-                print('Unsupported language: `%s`' % (language_iso2, ))
+                print('Unsupported language: `%s` (doc #%i)' % (language_iso2, document.id))
                 # and remember that for later processes (eg stemming)
                 document.hyperdata['__skipped__'] = 'ngrams_extraction'
                 document.save_hyperdata()
gargantext/views/api/ngramlists.py

@@ -74,8 +74,10 @@ def _query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
         - details: if False, send just the array of ngram_ids
                    if True, send triples with (ngram_id, term, scoring)
                                                                 ^^^^^^^
-        - scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
+        deprecated: scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
                              (for details and sorting)
+                             (no more OCCS counts of subforms)
     """
     if not details:
         # simple contents
@@ -86,12 +88,12 @@ def _query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
                  .query(
                         NodeNgramNgram.ngram2_id,
                         Ngram.terms,
-                        NodeNodeNgram.score
+                        # NodeNodeNgram.score #
                        )
                  .join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
-                 .join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
-                 .filter(NodeNodeNgram.node1_id == scoring_metric_id)
-                 .order_by(desc(NodeNodeNgram.score))
+                 # .join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
+                 # .filter(NodeNodeNgram.node1_id == scoring_metric_id)
+                 # .order_by(desc(NodeNodeNgram.score))
                 )
         # main filter