Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
f2e3a714
Commit
f2e3a714
authored
Jul 26, 2016
by
delanoe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[Clean]
parent
0b047fee
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
1 addition
and
149 deletions
+1
-149
__init__.py
gargantext/util/crawlers/__init__.py
+0
-23
metric_specificity.py
gargantext/util/toolchain/metric_specificity.py
+0
-125
ngrams_extraction.py
gargantext/util/toolchain/ngrams_extraction.py
+1
-1
No files found.
gargantext/util/crawlers/__init__.py
deleted
100644 → 0
View file @
0b047fee
import importlib

from gargantext.constants import RESOURCETYPES
from gargantext.settings import DEBUG

#if DEBUG: print("Loading available Crawlers")

# Package prefix under which each crawler module lives.
base_parser = "gargantext.util.crawlers"

# Dynamically import one crawler module per declared resource type.
for resource in RESOURCETYPES:
    if resource["crawler"] is not None:
        try:
            name = resource["crawler"]
            # crawler is type basename+"Crawler": the module file is the
            # lowercased basename (e.g. "PubmedCrawler" -> "pubmed").
            filename = name.replace("Crawler", "").lower()
            module = base_parser + ".%s" % (filename)
            # BUG FIX: importlib.import_module(name, package=None) takes at
            # most two arguments.  The previous call used the old
            # __import__-style signature (module, name, locals(), globals()),
            # which always raised TypeError — silently swallowed below, so no
            # crawler was ever actually loaded.
            importlib.import_module(module)
            #if DEBUG: print("\t-", name)
        except Exception as e:
            # Best-effort loading: report the broken declaration but keep
            # loading the remaining crawlers.
            print("Check constants.py RESOURCETYPES declaration %s\n CRAWLER %s is not available for %s"
                  % (str(e), resource["crawler"], resource["name"]))

#initial import
#from .cern import CernCrawler
#from .istex import ISTexCrawler
#from .pubmed import PubmedCrawler
gargantext/util/toolchain/metric_specificity.py
deleted
100644 → 0
View file @
0b047fee
"""
Computes a specificity metric from the ngram cooccurrence matrix.
+ SAVE => WeightedList => NodeNgram
"""
from
gargantext.models
import
Node
,
Ngram
,
NodeNgram
,
NodeNgramNgram
from
gargantext.util.db
import
session
,
aliased
,
func
,
bulk_insert
from
gargantext.util.lists
import
WeightedList
from
collections
import
defaultdict
from
pandas
import
DataFrame
import
pandas
as
pd
def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id=None):
    '''
    Compute the specificity, simple calculus.

    Builds a symmetric ngram cooccurrence matrix (from a stored cooc node or
    an in-memory WeightedMatrix), derives a per-ngram specificity score with
    pandas, and saves the scores as a "SPECIFICITY" node via WeightedList.

    Parameters:
        - corpus: corpus node; used to attach a new SPECIFICITY child node
          when overwrite_id is not given
        - cooc_id: id of a cooccurrences node to use as base (this or
          cooc_matrix is mandatory)
        - cooc_matrix: alternative in-memory WeightedMatrix source
        - overwrite_id: optional preexisting specificity node to overwrite

    Returns:
        the id of the SPECIFICITY node that was written.

    Raises:
        TypeError if neither cooc_id nor cooc_matrix is provided.
    '''
    # ngram1_id -> ngram2_id -> weight, defaulting to 0.0
    matrix = defaultdict(lambda: defaultdict(float))

    if cooc_id == None and cooc_matrix == None:
        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")

    elif cooc_id:
        cooccurrences = (session.query(NodeNgramNgram)
                         .filter(NodeNgramNgram.node_id == cooc_id)
                        )
        # no filtering: cooc already filtered on mainlist_id at creation
        # Stored pairs are directed; mirror each weight so the local matrix
        # is symmetric.
        for cooccurrence in cooccurrences:
            matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
            matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight

    elif cooc_matrix:
        # copy WeightedMatrix into local matrix structure
        # NOTE(review): unlike the cooc_id branch, this one is NOT mirrored —
        # presumably the WeightedMatrix is already symmetric; confirm.
        for (ngram1_id, ngram2_id) in cooc_matrix.items:
            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
            matrix[ngram1_id][ngram2_id] = w

    nb_ngrams = len(matrix)
    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    # Dense DataFrame; absent pairs become 0.
    x = DataFrame(matrix).fillna(0)

    # probability (x/y)  (<= divide each row by its total)
    x = x / x.sum(axis=1)

    # vectorisation
    # d:Matrix => v: Vector (len = nb_ngrams)
    # v = d.sum(axis=1) (minus itself)
    # NOTE(review): Series - DataFrame broadcasts, so xs/ys are DataFrames,
    # not vectors — the [0] indexing further down depends on this; confirm.
    xs = x.sum(axis=1) - x
    ys = x.sum(axis=0) - x

    # top included or excluded
    #n = ( xs + ys) / (2 * (x.shape[0] - 1))

    # top generic or specific (asc is spec, desc is generic)
    v = ( xs - ys) / ( 2 * (x.shape[0] - 1))

    ## d ##
    #######
    #          Grenelle  biodiversité  kilomètres  site  élus  île
    # Grenelle             0    0     4       0      0    0
    # biodiversité         0    0     0       0      4    0
    # kilomètres           4    0     0       0      4    0
    # site                 0    0     0       0      4    6
    # élus                 0    4     4       4      0    0
    # île                  0    0     0       6      0    0

    ## d.sum(axis=1) ##
    ###################
    # Grenelle             4
    # biodiversité         4
    # kilomètres           8
    # site                10
    # élus                12
    # île                  6

    # temporary result
    # ----------------
    # for now we use the row sums as the specificity ranking
    # (**same** order as with the pre-refactoring formula, but simpler to compute)
    # TODO check the mathematical AND semantic coherence of this indicator

    #v.sort_values(inplace=True)

    # [ ('biodiversité'  , 0.333 ),
    #   ('Grenelle'      , 0.5   ),
    #   ('île'           , 0.599 ),
    #   ('kilomètres'    , 1.333 ),
    #   ('site'          , 1.333 ),
    #   ('élus'          , 1.899 ) ]

    # ----------------
    # specificity node
    if overwrite_id:
        # overwrite pre-existing id: clear its previous NodeNgram rows first
        the_id = overwrite_id
        session.query(NodeNgram).filter(NodeNgram.node_id == the_id).delete()
        session.commit()
    else:
        # create a fresh SPECIFICITY child node under the corpus
        specnode = corpus.add_child(
            typename = "SPECIFICITY",
            name = "Specif (in:%s)" % corpus.id
        )
        session.add(specnode)
        session.commit()
        the_id = specnode.id

    # print(v)
    # NOTE(review): this mutates *global* pandas display options as a side
    # effect (currency-style float formatting) — looks like leftover debug
    # setup for the print above; confirm it is still wanted.
    pd.options.display.float_format = '${:,.2f}'.format

    if not v.empty:
        # NOTE(review): v.values.tolist()[0] keeps only the FIRST ROW of v
        # (v is a DataFrame here, see broadcast note above), pairing it with
        # the full index — verify this is the intended score vector.
        data = WeightedList(
            zip(  v.index.tolist()
                , v.values.tolist()[0]
                )
        )
        data.save(the_id)
    else:
        print("WARNING: had no terms in COOCS => empty SPECIFICITY node")

    return(the_id)
gargantext/util/toolchain/ngrams_extraction.py
View file @
f2e3a714
...
...
@@ -48,7 +48,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
ngrams_data
=
set
()
# extract ngrams
resource_type_index
=
corpus
.
resources
()[
0
][
'type'
]
documents_count
=
0
resource_type
=
RESOURCETYPES
[
resource_type_index
]
default_language_iso2
=
resource_type
[
'default_language'
]
for
documents_count
,
document
in
enumerate
(
corpus
.
children
(
'DOCUMENT'
)):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment