Commit ae79736a authored by delanoe's avatar delanoe

Merge branch 'testing' into stable

parents b7ba0b62 64b1de48
......@@ -2,33 +2,30 @@
# WARNING: to ensure consistency and backward compatibility, lists must keep
# their initial order (i.e., new elements should be appended at the end of the lists)
abstract:
---------
something between global params, constants,
configuration variables, ini file...
contents:
---------
+ db constants/ontology
+ database constants/ontology
- nodetypes
(db int <=> named types <=> python code)
+ input low-level limits
+ low-level limits
- query size
- max upload size
- doc parsing batch size
- word extraction batch size
+ process config
+ main process config
- resourcetypes config (~ input ontology)
- wordlist generation params
- graph creation params
- £TODO sequence of transformations "custom pipeline"
+ input process subclasses/subroutines
+ subprocess config
- crawling, import
- tagger services and functions
- parser services
......@@ -83,6 +80,7 @@ NODETYPES = [
# docs subset
'FAVORITES', # 15
# more scores (sorry!)
'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17
......@@ -90,6 +88,13 @@ NODETYPES = [
'RESOURCE', # 19
]
def get_nodetype_id_by_name(nodetype, nodetypes=None):
    '''nodetype :: name => nodetype id (its position in the nodetypes list)

    Parameters:
      - nodetype:  the name to look up (compared by its string form)
      - nodetypes: optional list of names to search in;
                   defaults to the module-level NODETYPES

    Returns the index of the first entry whose string form equals
    str(nodetype), or None when no entry matches.
    '''
    if nodetypes is None:
        nodetypes = NODETYPES
    wanted = str(nodetype)
    for nodetype_id, name in enumerate(nodetypes):
        # a None entry must never match the literal string "None"
        if name is not None and str(name) == wanted:
            return nodetype_id
    return None
INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db)
......@@ -154,7 +159,6 @@ INDEXED_HYPERDATA = {
# user parameters----------------------------------------
USER_LANG = ["fr", "en"]
# resources ---------------------------------------------
def get_resource(sourcetype):
'''resource :: type => resource dict'''
......
......@@ -7,7 +7,7 @@ from sqlalchemy import create_engine
from gargantext.util.lists import WeightedMatrix
# from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD
from gargantext.constants import DEFAULT_COOC_THRESHOLD, NODETYPES
from gargantext.constants import INDEXED_HYPERDATA
from gargantext.util.tools import datetime, convert_to_date
......@@ -53,9 +53,9 @@ def compute_coocs( corpus,
- groupings_id: optional synonym relations to add all subform counts
with their mainform's counts
- on_list_id: mainlist or maplist type, to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
- TODO stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is already provided)
- start, end: provide one or both temporal limits to filter on doc date
- TODO start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d")
......@@ -72,183 +72,160 @@ def compute_coocs( corpus,
connection = engine.connect()
# string vars for our SQL query
sql_statement = ""
doc_idx_statement = ""
# setting work memory high to improve cache perf.
final_sql = "set work_mem='1GB'; \n"
# where
# final_sql = cooc_sql + select_cooc_sql
cooc_sql = ""
select_cooc_sql = ""
# where
# cooc_sql = cooc_sql + ngram_filter_A_sql + ngram_filter + cooc_filter_sql
cooc_filter_sql = ""
ngram_filter_A_sql = ""
ngram_filter_B_sql = ""
# 2a) prepare the document selection (normal case)
doc_idx_statement = """
SELECT node_id, ngram_id
FROM nodes_ngrams
JOIN nodes
ON node_id = nodes.id
WHERE nodes.parent_id = {corpus_id}
AND nodes.typename = 4
""".format(corpus_id=corpus.id)
# 2b) same if document filters
if start or end:
date_type_id = INDEXED_HYPERDATA['publication_date']['id']
doc_idx_statement = """
SELECT node_id, ngram_id
FROM nodes_ngrams
JOIN nodes
ON node_id = nodes.id
-- preparing for date filter (1/2)
JOIN nodes_hyperdata
ON nodes_hyperdata.node_id = nodes_ngrams.node_id
WHERE nodes.parent_id = {corpus_id}
AND nodes.typename = 4
-- preparing for date filter (2/2)
AND nodes_hyperdata.key = {date_type_id}
""".format(corpus_id=corpus.id, date_type_id = date_type_id)
if start:
if not isinstance(start, datetime):
try:
start = datetime.strptime(start, '%Y-%m-%d')
except:
raise TypeError("'start' param expects datetime object or %%Y-%%m-%%d string")
# datetime object ~> date db formatted filter (2013-09-16 00:00:00+02)
start_filter = "AND nodes_hyperdata.value_utc >= %s::date" % start.strftime('%Y-%m-%d %H:%M:%S%z')
# the filtering by start limit
doc_idx_statement += "\n" + start_filter
if end:
if not isinstance(end, datetime):
try:
end = datetime.strptime(end, '%Y-%m-%d')
except:
raise TypeError("'end' param expects datetime object or %%Y-%%m-%%d string")
# datetime object ~> date db formatted filter
end_filter = "AND nodes_hyperdata.value_utc <= %s::date" % end.strftime('%Y-%m-%d %H:%M:%S%z')
# the filtering by end limit
doc_idx_statement += "\n" + end_filter
# 4) prepare the synonyms
if groupings_id:
syn_statement = """
SELECT * FROM nodes_ngrams_ngrams
WHERE node_id = {groupings_id}
""".format(groupings_id = groupings_id)
cooc_sql += """
WITH COOC as (
SELECT
COALESCE(grA.ngram1_id, wlA.ngram_id) as ngA,
COALESCE(grB.ngram1_id, wlB.ngram_id) as ngB,
COUNT(*) AS score
FROM
nodes AS n
-- / \
-- X Y
-- SQL graph for getting the cooccurrences
"""
# 5a) MAIN DB QUERY SKELETON (no groupings) --------------------------------
if not groupings_id:
sql_statement = """
SELECT cooc.*
FROM (
SELECT idxA.ngram_id AS ngA,
idxB.ngram_id AS ngB,
count((idxA.ngram_id,
idxB.ngram_id)) AS cwei
-- read doc index x 2
FROM ({doc_idx}) AS idxA
JOIN ({doc_idx}) AS idxB
-- cooc <=> in same doc node
ON idxA.node_id = idxB.node_id
GROUP BY ((idxA.ngram_id,idxB.ngram_id))
) AS cooc
""".format(doc_idx = doc_idx_statement)
# --------------------------------------------------------------------------
# 5b) MAIN DB QUERY SKELETON (with groupings)
# groupings: we use additional Translation (synonyms) for ngA and ngB
else:
sql_statement = """
SELECT cooc.*
FROM (
SELECT COALESCE(synA.ngram1_id, idxA.ngram_id) AS ngA,
COALESCE(synB.ngram1_id, idxB.ngram_id) AS ngB,
count((COALESCE(synA.ngram1_id, idxA.ngram_id),
COALESCE(synB.ngram1_id, idxB.ngram_id))) AS cwei
-- read doc index x 2
FROM ({doc_idx}) AS idxA
JOIN ({doc_idx}) AS idxB
-- cooc <=> in same doc node
ON idxA.node_id = idxB.node_id
-- when idxA.ngram_id is a subform
LEFT JOIN ({synonyms}) as synA
ON synA.ngram2_id = idxA.ngram_id
-- when idxB.ngram_id is a subform
LEFT JOIN ({synonyms}) as synB
ON synB.ngram2_id = idxB.ngram_id
GROUP BY (COALESCE(synA.ngram1_id, idxA.ngram_id),
COALESCE(synB.ngram1_id, idxB.ngram_id))
) AS cooc
""".format(doc_idx = doc_idx_statement,
synonyms = syn_statement)
# 6) prepare 2 x node_ngrams alias if whitelist
# 2b) stating the filters
cooc_filter_sql = """
WHERE
n.typename = {nodetype_id}
AND n.parent_id = {corpus_id}
GROUP BY 1,2
-- ==
-- GROUP BY ngA, ngB
)
""".format( nodetype_id = NODETYPES.index('DOCUMENT')
, corpus_id=corpus.id
)
# 3) taking the cooccurrences of ngram x2
ngram_filter_A_sql += """
-- STEP 1: X axis of the matrix
INNER JOIN nodes_ngrams
AS ngA ON ngA.node_id = n.id
-- \--> get the occurrences node/ngram of the corpus
"""
ngram_filter_B_sql += """
-- STEP 2: Y axi of the matrix
INNER JOIN nodes_ngrams
AS ngB ON ngB.node_id = n.id
-- \--> get the occurrences node/ngram of the corpus
"""
# 3) filter with lists (white or stop)
# on whiteList
if on_list_id:
sql_statement +="""
JOIN nodes_ngrams AS whitelistA
ON whitelistA.ngram_id = cooc.ngA
ngram_filter_A_sql += """
INNER JOIN nodes_ngrams
AS wlA ON ngA.ngram_id = wlA.ngram_id
AND wlA.node_id = {wla_node_id}
-- \--> filter with white/main list
""".format(wla_node_id = on_list_id)
ngram_filter_B_sql += """
INNER JOIN nodes_ngrams
AS wlB ON ngB.ngram_id = wlB.ngram_id
AND wlB.node_id = {wlb_node_id}
-- \--> filter with white/main list
""".format(wlb_node_id = on_list_id)
# on stopList
# TODO NOT TESTED
if stoplist_id:
raise("Stoplist not tested yet")
ngram_filter_A_sql += """
LEFT JOIN nodes_ngrams
AS stA ON ngA.ngram_id = stA.ngram_id
AND stA.node_id = {sta_node_id}
AND stA.ngram_id IS NULL
-- \--> filter with stop list
""".format(sta_node_id = stoplist_id)
ngram_filter_B_sql += """
LEFT JOIN nodes_ngrams
AS stB ON ngB.ngram_id = stB.ngram_id
AND stB.node_id = {stb_node_id}
AND stB.ngram_id IS NULL
-- \--> filter with white/main list
""".format(stb_node_id = stoplist_id)
JOIN nodes_ngrams AS whitelistB
ON whitelistB.ngram_id = cooc.ngB
"""
if stoplist_id:
# used for reverse join
sql_statement +="""
LEFT JOIN (
SELECT * FROM nodes_ngrams
WHERE nodes_ngrams.node_id = %i
) AS stoplistA
ON stoplistA.ngram_id = cooc.ngA
LEFT JOIN (
SELECT * FROM nodes_ngrams
WHERE nodes_ngrams.node_id = %i
) AS stoplistB
ON stoplistA.ngram_id = cooc.ngA
""" % (stoplist_id, stoplist_id)
# 7) FILTERS
# 4) prepare the synonyms
if groupings_id:
ngram_filter_A_sql += """
LEFT JOIN nodes_ngrams_ngrams
AS grA ON wlA.ngram_id = grA.ngram1_id
AND grA.node_id = {groupings_id}
-- \--> adding (joining) ngrams that are grouped
LEFT JOIN nodes_ngrams
AS wlAA ON grA.ngram2_id = wlAA.id
AND wlA.node_id = wlA.node_id
-- \--> adding (joining) ngrams that are not grouped
--LEFT JOIN ngrams AS wlAA ON grA.ngram2_id = wlAA.id
-- \--> for joining all synonyms even if they are not in the main list (white list)
# the inclusive threshold filter is always here
sql_statement += "\n WHERE cooc.cwei >= %i" % threshold
""".format(groupings_id = groupings_id)
ngram_filter_B_sql += """
LEFT JOIN nodes_ngrams_ngrams
AS grB ON wlB.ngram_id = grB.ngram1_id
AND grB.node_id = {groupings_id}
-- \--> adding (joining) ngrams that are grouped
LEFT JOIN nodes_ngrams
AS wlBB ON grB.ngram2_id = wlBB.id
AND wlB.node_id = wlB.node_id
-- \--> adding (joining) ngrams that are not grouped
-- LEFT JOIN ngrams AS wlBB ON grB.ngram2_id = wlBB.id
-- \--> for joining all synonyms even if they are not in the main list (white list)
""".format(groupings_id = groupings_id)
# the optional whitelist perimeters
if on_list_id:
sql_statement += "\n AND whitelistA.node_id = %i" % on_list_id
sql_statement += "\n AND whitelistB.node_id = %i" % on_list_id
if stoplist_id:
sql_statement += "\n AND stoplistA.ngram_id IS NULL"
sql_statement += "\n AND stoplistB.ngram_id IS NULL"
# 5) Build the main COOC query
cooc_sql += ngram_filter_A_sql + ngram_filter_B_sql + cooc_filter_sql
# 6) FILTERS
select_cooc_sql = """
SELECT ngA, ngB, score
FROM COOC --> from the query above
"""
# the inclusive threshold filter is always here
select_cooc_sql += "\n WHERE score >= %i" % threshold
# don't compute ngram with itself
# NB: this option is bad for main toolchain
if diagonal_filter:
sql_statement += "\n AND ngA != ngB"
select_cooc_sql += "\n AND ngA != ngB"
# a filter that takes the symmetry of the matrix into account
# NB: this option is also bad for main toolchain
if symmetry_filter:
sql_statement += "\n AND ngA <= ngB"
select_cooc_sql += "\n AND ngA <= ngB"
# 7) Building the final query
final_sql += cooc_sql + select_cooc_sql
# 6) EXECUTE QUERY
# ----------------
# debug
print(sql_statement)
print(final_sql)
# executing the SQL statement
results = connection.execute(sql_statement)
results = connection.execute(final_sql)
# => storage in our matrix structure
matrix = WeightedMatrix(results)
......
......@@ -63,10 +63,10 @@ def clusterByDistances( cooc_matrix
n = n.sort_index(inplace=False)
m = m.sort_index(inplace=False)
nodes_included = 500 #int(round(size/20,0))
nodes_included = 10000 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = 500 #int(round(size/10,0))
nodes_specific = 10000 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size
......
......@@ -14,25 +14,40 @@
<div class="container">
<div class="jumbotron">
<div class="row">
<div class="col-md-4 content">
<div class="col-md-8 content">
<h1>Gargantext</h1>
<p>A web platform to explore text-mining</p>
<p>
<a class="btn btn-primary btn-lg" href="/projects" title="Click and test by yourself">
<span class="glyphicon glyphicon-hand-right" aria-hidden="true"></span>
Enter in
Log in
</a>
<p>
<a class="btn btn-warning btn-lg" target="blank" href="https://iscpif.fr/services/applyforourservices/" title="Fill the form to sign up">
<span class="glyphicon glyphicon-hand-right" aria-hidden="true"></span>
Sign Up
</a>
<a class="btn btn-success btn-lg" target="blank" href="https://iscpif.fr/gargantext/your-first-map/" title="Read the documentation">
<span class="glyphicon glyphicon-hand-right" aria-hidden="true"></span>
Documentation
</a>
</p>
<span class="glyphicon glyphicon-warning-sign" aria-hidden="true"></span>
<small>
<i>
Some features may not work without a javascript optimized browser (Chromium for instance).
</i>
</small>
</p>
</div>
<div class="col-md-2 content"></div>
<div class="col-md-2 content"></div>
<div class="col-md-2 content">
<p class="right">
<div style="border:15px">
......@@ -62,8 +77,6 @@
</div>
</div>
<div class="container">
<div class="row">
<div class="col-md-4 content">
......@@ -89,6 +102,5 @@
</div>
{% endblock %}
<script type="text/javascript" src="{% static "lib/gargantext/help.js" %}"></script>
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment