Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
ae79736a
Commit
ae79736a
authored
Jan 18, 2017
by
delanoe
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'testing' into stable
parents
b7ba0b62
64b1de48
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
168 additions
and
175 deletions
+168
-175
constants.py
gargantext/constants.py
+12
-8
ngram_coocs.py
gargantext/util/toolchain/ngram_coocs.py
+132
-155
distances.py
graph/distances.py
+2
-2
home.html
templates/pages/main/home.html
+22
-10
No files found.
gargantext/constants.py
View file @
ae79736a
...
...
@@ -2,33 +2,30 @@
# WARNING: to ensure consistency and retrocompatibility, lists should keep the
# initial order (ie., new elements should be appended at the end of the lists)
abstract:
---------
something between global params, constants,
configuration variables, ini file...
contents:
---------
+ db constants/ontology
+ database constants/ontology
- nodetypes
(db int <=> named types <=> python code)
+
input
low-level limits
+ low-level limits
- query size
- max upload size
- doc parsing batch size
- word extraction batch size
+ process config
+
main
process config
- resourcetypes config (~ input ontology)
- wordlist generation params
- graph creation params
- £TODO sequence of transformations "custom pipeline"
+
input process subclasses/subroutines
+
subprocess config
- crawling, import
- tagger services and functions
- parser services
...
...
@@ -83,6 +80,7 @@ NODETYPES = [
# docs subset
'FAVORITES'
,
# 15
# more scores (sorry!)
'TIRANK-LOCAL'
,
# 16
'TIRANK-GLOBAL'
,
# 17
...
...
@@ -90,6 +88,13 @@ NODETYPES = [
'RESOURCE'
,
# 19
]
def get_nodetype_id_by_name(nodetype):
    '''nodetype :: name => nodetype id (position of the name in NODETYPES)

    NODETYPES is an ordered list of type names whose list index is used as
    the integer id (cf. NODETYPES.index('DOCUMENT') in the cooccurrence
    toolchain), so the id of a name is simply its position in that list.

    Returns the integer id, or None when the name is not a known nodetype.
    '''
    wanted = str(nodetype)
    for nodetype_id, name in enumerate(NODETYPES):
        # NOTE(review): None entries are skipped in case the list contains
        # placeholder slots -- confirm against the full NODETYPES definition
        if name is not None and str(name) == wanted:
            return nodetype_id
    return None
INDEXED_HYPERDATA
=
{
# TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db)
...
...
@@ -154,7 +159,6 @@ INDEXED_HYPERDATA = {
# user parameters----------------------------------------
USER_LANG
=
[
"fr"
,
"en"
]
# resources ---------------------------------------------
def
get_resource
(
sourcetype
):
'''resource :: type => resource dict'''
...
...
gargantext/util/toolchain/ngram_coocs.py
View file @
ae79736a
...
...
@@ -7,7 +7,7 @@ from sqlalchemy import create_engine
from
gargantext.util.lists
import
WeightedMatrix
# from gargantext.util.db import session, aliased, func
from
gargantext.util.db_cache
import
cache
from
gargantext.constants
import
DEFAULT_COOC_THRESHOLD
from
gargantext.constants
import
DEFAULT_COOC_THRESHOLD
,
NODETYPES
from
gargantext.constants
import
INDEXED_HYPERDATA
from
gargantext.util.tools
import
datetime
,
convert_to_date
...
...
@@ -53,9 +53,9 @@ def compute_coocs( corpus,
- groupings_id: optional synonym relations to add all subform counts
with their mainform's counts
- on_list_id: mainlist or maplist type, to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
-
TODO
stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is already provided)
- start, end: provide one or both temporal limits to filter on doc date
-
TODO
start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
this convention: "2001-01-01" aka "
%
Y-
%
m-
%
d")
...
...
@@ -72,183 +72,160 @@ def compute_coocs( corpus,
connection
=
engine
.
connect
()
# string vars for our SQL query
sql_statement
=
""
doc_idx_statement
=
""
# setting work memory high to improve cache perf.
final_sql
=
"set work_mem='1GB';
\n
"
# where
# final_sql = cooc_sql + select_cooc_sql
cooc_sql
=
""
select_cooc_sql
=
""
# where
# cooc_sql = cooc_sql + ngram_filter_A_sql + ngram_filter + cooc_filter_sql
cooc_filter_sql
=
""
ngram_filter_A_sql
=
""
ngram_filter_B_sql
=
""
# 2a) prepare the document selection (normal case)
doc_idx_statement
=
"""
SELECT node_id, ngram_id
FROM nodes_ngrams
JOIN nodes
ON node_id = nodes.id
WHERE nodes.parent_id = {corpus_id}
AND nodes.typename = 4
"""
.
format
(
corpus_id
=
corpus
.
id
)
# 2b) same if document filters
if
start
or
end
:
date_type_id
=
INDEXED_HYPERDATA
[
'publication_date'
][
'id'
]
doc_idx_statement
=
"""
SELECT node_id, ngram_id
FROM nodes_ngrams
JOIN nodes
ON node_id = nodes.id
-- preparing for date filter (1/2)
JOIN nodes_hyperdata
ON nodes_hyperdata.node_id = nodes_ngrams.node_id
WHERE nodes.parent_id = {corpus_id}
AND nodes.typename = 4
-- preparing for date filter (2/2)
AND nodes_hyperdata.key = {date_type_id}
"""
.
format
(
corpus_id
=
corpus
.
id
,
date_type_id
=
date_type_id
)
if
start
:
if
not
isinstance
(
start
,
datetime
):
try
:
start
=
datetime
.
strptime
(
start
,
'
%
Y-
%
m-
%
d'
)
except
:
raise
TypeError
(
"'start' param expects datetime object or
%%
Y-
%%
m-
%%
d string"
)
# datetime object ~> date db formatted filter (2013-09-16 00:00:00+02)
start_filter
=
"AND nodes_hyperdata.value_utc >=
%
s::date"
%
start
.
strftime
(
'
%
Y-
%
m-
%
d
%
H:
%
M:
%
S
%
z'
)
# the filtering by start limit
doc_idx_statement
+=
"
\n
"
+
start_filter
if
end
:
if
not
isinstance
(
end
,
datetime
):
try
:
end
=
datetime
.
strptime
(
end
,
'
%
Y-
%
m-
%
d'
)
except
:
raise
TypeError
(
"'end' param expects datetime object or
%%
Y-
%%
m-
%%
d string"
)
# datetime object ~> date db formatted filter
end_filter
=
"AND nodes_hyperdata.value_utc <=
%
s::date"
%
end
.
strftime
(
'
%
Y-
%
m-
%
d
%
H:
%
M:
%
S
%
z'
)
# the filtering by end limit
doc_idx_statement
+=
"
\n
"
+
end_filter
# 4) prepare the synonyms
if
groupings_id
:
syn_statement
=
"""
SELECT * FROM nodes_ngrams_ngrams
WHERE node_id = {groupings_id}
"""
.
format
(
groupings_id
=
groupings_id
)
cooc_sql
+=
"""
WITH COOC as (
SELECT
COALESCE(grA.ngram1_id, wlA.ngram_id) as ngA,
COALESCE(grB.ngram1_id, wlB.ngram_id) as ngB,
COUNT(*) AS score
FROM
nodes AS n
-- /
\
-- X Y
-- SQL graph for getting the cooccurrences
"""
# 5a) MAIN DB QUERY SKELETON (no groupings) --------------------------------
if
not
groupings_id
:
sql_statement
=
"""
SELECT cooc.*
FROM (
SELECT idxA.ngram_id AS ngA,
idxB.ngram_id AS ngB,
count((idxA.ngram_id,
idxB.ngram_id)) AS cwei
-- read doc index x 2
FROM ({doc_idx}) AS idxA
JOIN ({doc_idx}) AS idxB
-- cooc <=> in same doc node
ON idxA.node_id = idxB.node_id
GROUP BY ((idxA.ngram_id,idxB.ngram_id))
) AS cooc
"""
.
format
(
doc_idx
=
doc_idx_statement
)
# --------------------------------------------------------------------------
# 5b) MAIN DB QUERY SKELETON (with groupings)
# groupings: we use additional Translation (synonyms) for ngA and ngB
else
:
sql_statement
=
"""
SELECT cooc.*
FROM (
SELECT COALESCE(synA.ngram1_id, idxA.ngram_id) AS ngA,
COALESCE(synB.ngram1_id, idxB.ngram_id) AS ngB,
count((COALESCE(synA.ngram1_id, idxA.ngram_id),
COALESCE(synB.ngram1_id, idxB.ngram_id))) AS cwei
-- read doc index x 2
FROM ({doc_idx}) AS idxA
JOIN ({doc_idx}) AS idxB
-- cooc <=> in same doc node
ON idxA.node_id = idxB.node_id
-- when idxA.ngram_id is a subform
LEFT JOIN ({synonyms}) as synA
ON synA.ngram2_id = idxA.ngram_id
-- when idxB.ngram_id is a subform
LEFT JOIN ({synonyms}) as synB
ON synB.ngram2_id = idxB.ngram_id
GROUP BY (COALESCE(synA.ngram1_id, idxA.ngram_id),
COALESCE(synB.ngram1_id, idxB.ngram_id))
) AS cooc
"""
.
format
(
doc_idx
=
doc_idx_statement
,
synonyms
=
syn_statement
)
# 6) prepare 2 x node_ngrams alias if whitelist
# 2b) stating the filters
cooc_filter_sql
=
"""
WHERE
n.typename = {nodetype_id}
AND n.parent_id = {corpus_id}
GROUP BY 1,2
-- ==
-- GROUP BY ngA, ngB
)
"""
.
format
(
nodetype_id
=
NODETYPES
.
index
(
'DOCUMENT'
)
,
corpus_id
=
corpus
.
id
)
# 3) taking the cooccurrences of ngram x2
ngram_filter_A_sql
+=
"""
-- STEP 1: X axis of the matrix
INNER JOIN nodes_ngrams
AS ngA ON ngA.node_id = n.id
--
\
--> get the occurrences node/ngram of the corpus
"""
ngram_filter_B_sql
+=
"""
-- STEP 2: Y axi of the matrix
INNER JOIN nodes_ngrams
AS ngB ON ngB.node_id = n.id
--
\
--> get the occurrences node/ngram of the corpus
"""
# 3) filter with lists (white or stop)
# on whiteList
if
on_list_id
:
sql_statement
+=
"""
JOIN nodes_ngrams AS whitelistA
ON whitelistA.ngram_id = cooc.ngA
ngram_filter_A_sql
+=
"""
INNER JOIN nodes_ngrams
AS wlA ON ngA.ngram_id = wlA.ngram_id
AND wlA.node_id = {wla_node_id}
--
\
--> filter with white/main list
"""
.
format
(
wla_node_id
=
on_list_id
)
ngram_filter_B_sql
+=
"""
INNER JOIN nodes_ngrams
AS wlB ON ngB.ngram_id = wlB.ngram_id
AND wlB.node_id = {wlb_node_id}
--
\
--> filter with white/main list
"""
.
format
(
wlb_node_id
=
on_list_id
)
# on stopList
# TODO NOT TESTED
if
stoplist_id
:
raise
(
"Stoplist not tested yet"
)
ngram_filter_A_sql
+=
"""
LEFT JOIN nodes_ngrams
AS stA ON ngA.ngram_id = stA.ngram_id
AND stA.node_id = {sta_node_id}
AND stA.ngram_id IS NULL
--
\
--> filter with stop list
"""
.
format
(
sta_node_id
=
stoplist_id
)
ngram_filter_B_sql
+=
"""
LEFT JOIN nodes_ngrams
AS stB ON ngB.ngram_id = stB.ngram_id
AND stB.node_id = {stb_node_id}
AND stB.ngram_id IS NULL
--
\
--> filter with white/main list
"""
.
format
(
stb_node_id
=
stoplist_id
)
JOIN nodes_ngrams AS whitelistB
ON whitelistB.ngram_id = cooc.ngB
"""
if
stoplist_id
:
# used for reverse join
sql_statement
+=
"""
LEFT JOIN (
SELECT * FROM nodes_ngrams
WHERE nodes_ngrams.node_id =
%
i
) AS stoplistA
ON stoplistA.ngram_id = cooc.ngA
LEFT JOIN (
SELECT * FROM nodes_ngrams
WHERE nodes_ngrams.node_id =
%
i
) AS stoplistB
ON stoplistA.ngram_id = cooc.ngA
"""
%
(
stoplist_id
,
stoplist_id
)
# 7) FILTERS
# 4) prepare the synonyms
if
groupings_id
:
ngram_filter_A_sql
+=
"""
LEFT JOIN nodes_ngrams_ngrams
AS grA ON wlA.ngram_id = grA.ngram1_id
AND grA.node_id = {groupings_id}
--
\
--> adding (joining) ngrams that are grouped
LEFT JOIN nodes_ngrams
AS wlAA ON grA.ngram2_id = wlAA.id
AND wlA.node_id = wlA.node_id
--
\
--> adding (joining) ngrams that are not grouped
--LEFT JOIN ngrams AS wlAA ON grA.ngram2_id = wlAA.id
--
\
--> for joining all synonyms even if they are not in the main list (white list)
# the inclusive threshold filter is always here
sql_statement
+=
"
\n
WHERE cooc.cwei >=
%
i"
%
threshold
"""
.
format
(
groupings_id
=
groupings_id
)
ngram_filter_B_sql
+=
"""
LEFT JOIN nodes_ngrams_ngrams
AS grB ON wlB.ngram_id = grB.ngram1_id
AND grB.node_id = {groupings_id}
--
\
--> adding (joining) ngrams that are grouped
LEFT JOIN nodes_ngrams
AS wlBB ON grB.ngram2_id = wlBB.id
AND wlB.node_id = wlB.node_id
--
\
--> adding (joining) ngrams that are not grouped
-- LEFT JOIN ngrams AS wlBB ON grB.ngram2_id = wlBB.id
--
\
--> for joining all synonyms even if they are not in the main list (white list)
"""
.
format
(
groupings_id
=
groupings_id
)
# the optional whitelist perimeters
if
on_list_id
:
sql_statement
+=
"
\n
AND whitelistA.node_id =
%
i"
%
on_list_id
sql_statement
+=
"
\n
AND whitelistB.node_id =
%
i"
%
on_list_id
if
stoplist_id
:
sql_statement
+=
"
\n
AND stoplistA.ngram_id IS NULL"
sql_statement
+=
"
\n
AND stoplistB.ngram_id IS NULL"
# 5) Build the main COOC query
cooc_sql
+=
ngram_filter_A_sql
+
ngram_filter_B_sql
+
cooc_filter_sql
# 6) FILTERS
select_cooc_sql
=
"""
SELECT ngA, ngB, score
FROM COOC --> from the query above
"""
# the inclusive threshold filter is always here
select_cooc_sql
+=
"
\n
WHERE score >=
%
i"
%
threshold
# don't compute ngram with itself
# NB: this option is bad for main toolchain
if
diagonal_filter
:
s
ql_statement
+=
"
\n
AND ngA != ngB"
s
elect_cooc_sql
+=
"
\n
AND ngA != ngB"
# a filter that takes the symmetry into account
# NB: this option is also bad for main toolchain
if
symmetry_filter
:
sql_statement
+=
"
\n
AND ngA <= ngB"
select_cooc_sql
+=
"
\n
AND ngA <= ngB"
# 7) Building the final query
final_sql
+=
cooc_sql
+
select_cooc_sql
# 6) EXECUTE QUERY
# ----------------
# debug
print
(
sql_statement
)
print
(
final_sql
)
# executing the SQL statement
results
=
connection
.
execute
(
sql_statement
)
results
=
connection
.
execute
(
final_sql
)
# => storage in our matrix structure
matrix
=
WeightedMatrix
(
results
)
...
...
graph/distances.py
View file @
ae79736a
...
...
@@ -63,10 +63,10 @@ def clusterByDistances( cooc_matrix
n
=
n
.
sort_index
(
inplace
=
False
)
m
=
m
.
sort_index
(
inplace
=
False
)
nodes_included
=
5
00
#int(round(size/20,0))
nodes_included
=
100
00
#int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific
=
5
00
#int(round(size/10,0))
nodes_specific
=
100
00
#int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size
...
...
templates/pages/main/home.html
View file @
ae79736a
...
...
@@ -14,25 +14,40 @@
<div
class=
"container"
>
<div
class=
"jumbotron"
>
<div
class=
"row"
>
<div
class=
"col-md-
4
content"
>
<div
class=
"col-md-
8
content"
>
<h1>
Gargantext
</h1>
<p>
A web platform to explore text-mining
</p>
<p>
<a
class=
"btn btn-primary btn-lg"
href=
"/projects"
title=
"Click and test by yourself"
>
<span
class=
"glyphicon glyphicon-hand-right"
aria-hidden=
"true"
></span>
Enter
in
Log
in
</a>
<p>
<a
class=
"btn btn-warning btn-lg"
target=
"blank"
href=
"https://iscpif.fr/services/applyforourservices/"
title=
"Fill the form to sign up"
>
<span
class=
"glyphicon glyphicon-hand-right"
aria-hidden=
"true"
></span>
Sign Up
</a>
<a
class=
"btn btn-success btn-lg"
target=
"blank"
href=
"https://iscpif.fr/gargantext/your-first-map/"
title=
"Fill the form to sign up"
>
<span
class=
"glyphicon glyphicon-hand-right"
aria-hidden=
"true"
></span>
Documentation
</a>
</p>
<span
class=
"glyphicon glyphicon-warning-sign"
aria-hidden=
"true"
></span>
<small>
<i>
Some features may not work without a javascript optimized browser (Chromium for instance).
</i>
</small>
</p>
</div>
<div
class=
"col-md-2 content"
></div>
<div
class=
"col-md-2 content"
></div>
<div
class=
"col-md-2 content"
>
<p
class=
"right"
>
<div
style=
"border:15px"
>
...
...
@@ -62,8 +77,6 @@
</div>
</div>
<div
class=
"container"
>
<div
class=
"row"
>
<div
class=
"col-md-4 content"
>
...
...
@@ -89,6 +102,5 @@
</div>
{% endblock %}
<script
type=
"text/javascript"
src=
"{% static "
lib
/
gargantext
/
help
.
js
"
%}"
></script>
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment