......@@ -2,6 +2,8 @@
* Guided Tour
* Sources form highlighting crawlers
## Version 3.0.7
* Alembic implemented to manage database migrations
## Version
* REPEC Crawler (connection with
# A generic, single database configuration.
# path to migration scripts
script_location = alembic
# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s
# timezone to use when rendering the date
# within the migration file as well as the filename.
# string value is passed to
# leave blank for localtime
# timezone =
# max length of characters to apply to the
# "slug" field
#truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; this defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path
# version_locations = %(here)s/bar %(here)s/bat alembic/versions
# the output encoding used when revision files
# are written from
# output_encoding = utf-8
# XXX For database access configuration, see alembic/
#sqlalchemy.url = driver://user:pass@localhost/dbname
tables = django_* celery_* djcelery_* auth_*
# Logging configuration
keys = root,sqlalchemy,alembic
keys = console
keys = generic
level = WARN
handlers = console
qualname =
level = WARN
handlers =
qualname = sqlalchemy.engine
level = INFO
handlers =
qualname = alembic
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
Alembic must be installed in the virtualenv in order to use right python paths,
so it's installed with pip. Commands described in this little documentation
must be executed from gargantext root directory, ie. /srv/gargantext.
Keep in mind that Alembic only handles SQLAlchemy models: tables created from
Django ORM must be put out of Alembic sight. See [alembic:exclude] section in
# To upgrade a database that was populated before Gargantext started using
# Alembic, don't forget to tell Alembic your current version before running
# the "upgrade head" command. If you don't want to do this, you can of course
# drop your database and really start from scratch.
alembic stamp 601e9d9baa4c
alembic upgrade head
alembic downgrade base
alembic revision -m "Message for this migration"
# A migration script is then created in alembic/versions directory. For
# example alembic/versions/
# where 3adcc9a56557 is the revision id generated by Alembic.
# This script must be edited to write the migration itself, mainly
# in `upgrade` and `downgrade` functions. See Alembic documentation for
# further details.
alembic revision --autogenerate -m "Message for this migration"
# Alembic should generate a script reflecting changes already made in
# database. However it is always a good idea to check it and edit it
# manually, Alembic is not always accurate and can't see all alterations.
# It should work with basic changes such as model or column creation. See
from __future__ import with_statement
from alembic import context
from sqlalchemy import engine_from_config, pool
from logging.config import fileConfig
import re
# Add the project root directory to the path and set up Django...
import os
import django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
# be able to import gargantext.
from gargantext import settings, models
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
config.set_main_option("sqlalchemy.url", settings.DATABASES['default']['URL'])
# Interpret the config file for Python logging.
# This line sets up loggers basically.
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = models.Base.metadata
# other values from the config, defined by the needs of,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
# Inspired from
def exclude_tables_from_config(config):
    """Build the regex matching the table names listed for exclusion.

    ``config`` is a mapping read with ``.get("tables", '')``; ``None`` is
    treated as an empty mapping. The ``tables`` value is a
    whitespace-separated list of table names in which ``*`` stands for any
    run of characters (it is rewritten to the regex ``.*``).

    Returns a compiled pattern meant to be used with ``.match(name)``. When
    no table is listed, the returned pattern matches nothing.
    """
    raw = (config or {}).get("tables", '')
    # split() without an argument drops empty fields (doubled or trailing
    # spaces, line breaks): an empty alternative in the regex would match
    # every table name.
    tables = raw.replace('*', '.*').split()
    if not tables:
        # An empty pattern matches any string; "(?!)" can never match.
        return re.compile(r'(?!)')
    return re.compile('|'.join(tables))
exclude_tables = exclude_tables_from_config(config.get_section('alembic:exclude'))
def include_object(obj, name, typ, reflected, compare_to):
    """Return False for tables matched by ``exclude_tables``, True otherwise.

    Only ``name`` and ``typ`` are inspected; the other parameters are
    accepted but unused. Presumably handed to Alembic as its
    ``include_object`` filter -- the call site is not visible here.
    """
    is_excluded_table = typ == "table" and exclude_tables.match(name)
    return not is_excluded_table
def run_migrations_offline():
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
url = config.get_main_option("sqlalchemy.url")
url=url, target_metadata=target_metadata, literal_binds=True,
with context.begin_transaction():
def run_migrations_online():
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
connectable = engine_from_config(
with connectable.connect() as connection:
with context.begin_transaction():
if context.is_offline_mode():
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
from alembic import op
import sqlalchemy as sa
import gargantext
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade():
${upgrades if upgrades else "pass"}
def downgrade():
${downgrades if downgrades else "pass"}
"""Put a timezone on
Revision ID: 08230100f262
Revises: 601e9d9baa4c
Create Date: 2017-07-06 13:47:10.788569
from alembic import op
import sqlalchemy as sa
import gargantext
# revision identifiers, used by Alembic.
revision = '08230100f262'
down_revision = '601e9d9baa4c'
branch_labels = None
depends_on = None
def upgrade():
    """Switch the ``nodes.date`` column to a timezone-aware DateTime."""
    tz_aware = sa.DateTime(timezone=True)
    op.alter_column('nodes', 'date', type_=tz_aware)
def downgrade():
    """Switch the ``nodes.date`` column back to a DateTime without timezone."""
    tz_naive = sa.DateTime(timezone=False)
    op.alter_column('nodes', 'date', type_=tz_naive)
"""Add OCC_HIST & OCC_HIST_PART functions
Revision ID: 601e9d9baa4c
Revises: 932dbf3e8c43
Create Date: 2017-07-06 10:52:16.161118
from alembic import op
import sqlalchemy as sa
from import ReplaceableObject
# revision identifiers, used by Alembic.
revision = '601e9d9baa4c'
down_revision = '932dbf3e8c43'
branch_labels = None
depends_on = None
# -- OCC_HIST_PART :: -> -> Start -> End
occ_hist_part = ReplaceableObject(
"OCC_HIST_PART(int, int, timestamp, timestamp)",
RETURNS TABLE (ng_id int, score float8)
AS $$
COALESCE(gr.ngram1_id, ng1.ngram_id) as ng_id,
SUM(ng1.weight) as score
from nodes n
INNER JOIN nodes as n1 ON =
INNER JOIN nodes_ngrams ng1 ON ng1.node_id =
-- Limit with timestamps: ]start, end]
INNER JOIN nodes_hyperdata nh1 ON nh1.node_id =
AND nh1.value_utc > $3
AND nh1.value_utc <= $4
-- Group List
LEFT JOIN nodes_ngrams_ngrams gr ON ng1.ngram_id = gr.ngram2_id
AND gr.node_id = $2
n.typename = 4
AND n.parent_id = $1
# -- OCC_HIST :: -> -> -> Start -> EndFirst -> EndLast
# -- SELECT * FROM OCC_HIST(182856, 183859, 183866, '1800-03-15 17:00:00+01', '2000-03-15 17:00:00+01', '2017-03-15 17:00:00+01')
occ_hist = ReplaceableObject(
"OCC_HIST(int, int, int, timestamp, timestamp, timestamp)",
RETURNS TABLE (ng_id int, score numeric)
AS $$
WITH OCC1 as (SELECT * from OCC_HIST_PART($1, $2, $4, $5))
, OCC2 as (SELECT * from OCC_HIST_PART($1, $2, $5, $6))
, GROWTH as (SELECT ml.ngram_id as ngram_id
, COALESCE(OCC1.score, null) as score1
, COALESCE(OCC2.score, null) as score2
FROM nodes_ngrams ml
LEFT JOIN OCC1 ON OCC1.ng_id = ml.ngram_id
LEFT JOIN OCC2 ON OCC2.ng_id = ml.ngram_id
WHERE ml.node_id = $3
ORDER by score2 DESC)
SELECT ngram_id, COALESCE(ROUND(CAST((100 * (score2 - score1) / COALESCE((score2 + score1), 1)) as numeric), 2), 0) from GROWTH
# -- BEHAVIORAL TEST (should be equal to occ in terms table)
# -- WITH OCC as (SELECT * from OCC_HIST(182856, 183859, '1800-03-15 17:00:00+01', '2300-03-15 17:00:00+01'))
# -- SELECT ng_id, score from OCC
# -- INNER JOIN nodes_ngrams ml on ml.ngram_id = ng_id
# -- AND ml.node_id = 183866
# -- ORDER BY score DESC;
def upgrade():
def downgrade():
"""Initial migration
Revision ID: 932dbf3e8c43
Create Date: 2017-07-05 16:41:23.951422
from alembic import op
import sqlalchemy as sa
import gargantext
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '932dbf3e8c43'
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('user1_id', sa.Integer(), nullable=True),
sa.Column('user2_id', sa.Integer(), nullable=True),
sa.Column('is_blocked', sa.Boolean(), nullable=True),
sa.Column('date_creation', sa.DateTime(), nullable=True),
sa.ForeignKeyConstraint(['user1_id'], [''], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['user2_id'], [''], ondelete='CASCADE'),
sa.UniqueConstraint('user1_id', 'user2_id')
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('terms', sa.String(length=255), nullable=True),
sa.Column('n', sa.Integer(), nullable=True),
op.create_index('ngrams_id_n_idx', 'ngrams', ['id', 'n'], unique=False)
op.create_index('ngrams_n_idx', 'ngrams', ['n'], unique=False)
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('typename', gargantext.models.nodes.NodeType(), nullable=True),
sa.Column('user_id', sa.Integer(), nullable=True),
sa.Column('parent_id', sa.Integer(), nullable=True),
sa.Column('name', sa.String(length=255), nullable=True),
sa.Column('date', sa.DateTime(), nullable=True),
sa.Column('hyperdata', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
sa.ForeignKeyConstraint(['parent_id'], [''], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['user_id'], [''], ondelete='CASCADE'),
op.create_index(op.f('ix_nodes_typename'), 'nodes', ['typename'], unique=False)
op.create_index('nodes_hyperdata_idx', 'nodes', ['hyperdata'], unique=False)
op.create_index('nodes_user_id_typename_parent_id_idx', 'nodes', ['user_id', 'typename', 'parent_id'], unique=False)
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('node_id', sa.Integer(), nullable=True),
sa.Column('key', gargantext.models.hyperdata.HyperdataKey(), nullable=True),
sa.Column('value_int', sa.Integer(), nullable=True),
sa.Column('value_flt', postgresql.DOUBLE_PRECISION(), nullable=True),
sa.Column('value_utc', sa.DateTime(timezone=True), nullable=True),
sa.Column('value_str', sa.String(length=255), nullable=True),
sa.Column('value_txt', sa.Text(), nullable=True),
sa.ForeignKeyConstraint(['node_id'], [''], ondelete='CASCADE'),
op.create_index(op.f('ix_nodes_hyperdata_value_flt'), 'nodes_hyperdata', ['value_flt'], unique=False)
op.create_index(op.f('ix_nodes_hyperdata_value_int'), 'nodes_hyperdata', ['value_int'], unique=False)
op.create_index(op.f('ix_nodes_hyperdata_value_str'), 'nodes_hyperdata', ['value_str'], unique=False)
op.create_index(op.f('ix_nodes_hyperdata_value_utc'), 'nodes_hyperdata', ['value_utc'], unique=False)
op.create_index('nodes_hyperdata_node_id_key_idx', 'nodes_hyperdata', ['node_id', 'key'], unique=False)
op.create_index('nodes_hyperdata_node_id_key_value_flt_idx', 'nodes_hyperdata', ['node_id', 'key', 'value_flt'], unique=False)
op.create_index('nodes_hyperdata_node_id_key_value_int_idx', 'nodes_hyperdata', ['node_id', 'key', 'value_int'], unique=False)
op.create_index('nodes_hyperdata_node_id_key_value_str_idx', 'nodes_hyperdata', ['node_id', 'key', 'value_str'], unique=False)
op.create_index('nodes_hyperdata_node_id_key_value_utc_idx', 'nodes_hyperdata', ['node_id', 'key', 'value_utc'], unique=False)
op.create_index('nodes_hyperdata_node_id_value_utc_idx', 'nodes_hyperdata', ['node_id', 'value_utc'], unique=False)
sa.Column('node_id', sa.Integer(), nullable=False),
sa.Column('ngram_id', sa.Integer(), nullable=False),
sa.Column('weight', sa.Float(), nullable=True),
sa.ForeignKeyConstraint(['ngram_id'], [''], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['node_id'], [''], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('node_id', 'ngram_id')
op.create_index('nodes_ngrams_ngram_id_idx', 'nodes_ngrams', ['ngram_id'], unique=False)
op.create_index('nodes_ngrams_node_id_idx', 'nodes_ngrams', ['node_id'], unique=False)
op.create_index('nodes_ngrams_node_id_ngram_id_idx', 'nodes_ngrams', ['node_id', 'ngram_id'], unique=False)
sa.Column('node_id', sa.Integer(), nullable=False),
sa.Column('ngram1_id', sa.Integer(), nullable=False),
sa.Column('ngram2_id', sa.Integer(), nullable=False),
sa.Column('weight', sa.Float(precision=24), nullable=True),
sa.ForeignKeyConstraint(['ngram1_id'], [''], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['ngram2_id'], [''], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['node_id'], [''], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('node_id', 'ngram1_id', 'ngram2_id')
op.create_index('nodes_ngrams_ngrams_ngram1_id_idx', 'nodes_ngrams_ngrams', ['ngram1_id'], unique=False)
op.create_index('nodes_ngrams_ngrams_ngram2_id_idx', 'nodes_ngrams_ngrams', ['ngram2_id'], unique=False)
op.create_index('nodes_ngrams_ngrams_node_id_idx', 'nodes_ngrams_ngrams', ['node_id'], unique=False)
op.create_index('nodes_ngrams_ngrams_node_id_ngram1_id_ngram2_id_idx', 'nodes_ngrams_ngrams', ['node_id', 'ngram1_id', 'ngram2_id'], unique=False)
sa.Column('node1_id', sa.Integer(), nullable=False),
sa.Column('node2_id', sa.Integer(), nullable=False),
sa.Column('score', sa.Float(precision=24), nullable=True),
sa.ForeignKeyConstraint(['node1_id'], [''], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['node2_id'], [''], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('node1_id', 'node2_id')
op.create_index('nodes_nodes_node1_id_node2_id_idx', 'nodes_nodes', ['node1_id', 'node2_id'], unique=False)
sa.Column('node1_id', sa.Integer(), nullable=False),
sa.Column('node2_id', sa.Integer(), nullable=False),
sa.Column('ngram_id', sa.Integer(), nullable=False),
sa.Column('score', sa.Float(precision=24), nullable=True),
sa.ForeignKeyConstraint(['ngram_id'], [''], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['node1_id'], [''], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['node2_id'], [''], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('node1_id', 'node2_id', 'ngram_id')
op.create_index('nodes_nodes_ngrams_node1_id_idx', 'nodes_nodes_ngrams', ['node1_id'], unique=False)
op.create_index('nodes_nodes_ngrams_node2_id_idx', 'nodes_nodes_ngrams', ['node2_id'], unique=False)
# ### end Alembic commands ###
def downgrade():
# Undo the index creation done by upgrade() of this revision: every index
# is dropped in the reverse of the order in which upgrade() created it.
# ### commands auto generated by Alembic - please adjust! ###
# -- indexes on nodes_nodes_ngrams
op.drop_index('nodes_nodes_ngrams_node2_id_idx', table_name='nodes_nodes_ngrams')
op.drop_index('nodes_nodes_ngrams_node1_id_idx', table_name='nodes_nodes_ngrams')
# -- indexes on nodes_nodes
op.drop_index('nodes_nodes_node1_id_node2_id_idx', table_name='nodes_nodes')
# -- indexes on nodes_ngrams_ngrams
op.drop_index('nodes_ngrams_ngrams_node_id_ngram1_id_ngram2_id_idx', table_name='nodes_ngrams_ngrams')
op.drop_index('nodes_ngrams_ngrams_node_id_idx', table_name='nodes_ngrams_ngrams')
op.drop_index('nodes_ngrams_ngrams_ngram2_id_idx', table_name='nodes_ngrams_ngrams')
op.drop_index('nodes_ngrams_ngrams_ngram1_id_idx', table_name='nodes_ngrams_ngrams')
# -- indexes on nodes_ngrams
op.drop_index('nodes_ngrams_node_id_ngram_id_idx', table_name='nodes_ngrams')
op.drop_index('nodes_ngrams_node_id_idx', table_name='nodes_ngrams')
op.drop_index('nodes_ngrams_ngram_id_idx', table_name='nodes_ngrams')
# -- indexes on nodes_hyperdata (explicitly named ones first, then the
#    ones whose names go through op.f(), mirroring upgrade())
op.drop_index('nodes_hyperdata_node_id_value_utc_idx', table_name='nodes_hyperdata')
op.drop_index('nodes_hyperdata_node_id_key_value_utc_idx', table_name='nodes_hyperdata')
op.drop_index('nodes_hyperdata_node_id_key_value_str_idx', table_name='nodes_hyperdata')
op.drop_index('nodes_hyperdata_node_id_key_value_int_idx', table_name='nodes_hyperdata')
op.drop_index('nodes_hyperdata_node_id_key_value_flt_idx', table_name='nodes_hyperdata')
op.drop_index('nodes_hyperdata_node_id_key_idx', table_name='nodes_hyperdata')
op.drop_index(op.f('ix_nodes_hyperdata_value_utc'), table_name='nodes_hyperdata')
op.drop_index(op.f('ix_nodes_hyperdata_value_str'), table_name='nodes_hyperdata')
op.drop_index(op.f('ix_nodes_hyperdata_value_int'), table_name='nodes_hyperdata')
op.drop_index(op.f('ix_nodes_hyperdata_value_flt'), table_name='nodes_hyperdata')
# -- indexes on nodes
op.drop_index('nodes_user_id_typename_parent_id_idx', table_name='nodes')
op.drop_index('nodes_hyperdata_idx', table_name='nodes')
op.drop_index(op.f('ix_nodes_typename'), table_name='nodes')
# -- indexes on ngrams
op.drop_index('ngrams_n_idx', table_name='ngrams')
op.drop_index('ngrams_id_n_idx', table_name='ngrams')
# ### end Alembic commands ###
#!/usr/bin/env python
import sys
import os
if __name__ == "__main__":
# Django settings
dirname = os.path.dirname(os.path.realpath(__file__))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext.settings")
# initialize Django application
from django.core.wsgi import get_wsgi_application
application = get_wsgi_application()
# retrieve Django models
import django.apps
django_models = django.apps.apps.get_models()
django_models_names = set(model._meta.db_table for model in django_models)
# migrate SQLAlchemy models
from gargantext import models
from gargantext.util.db import Base, engine
sqla_models_names = (
model for model in Base.metadata.tables.keys()
if model not in django_models_names
sqla_models = (
for model_name in sqla_models_names
for model in sqla_models:
print('created model: `%s`' % model)
except Exception as e:
print('could not create model: `%s`, %s' % (model, e))
......@@ -3,9 +3,10 @@
## Node
The table (nodes) is a list of nodes: [Node]
The table (nodes) is a list of nodes: `[Node]`
Each Node has:
- a typename
- a parent_id
- a name
......@@ -13,63 +14,90 @@ Each Node has:
### Each Node has a parent_id
Node A
├── Node B
└── Node C
Node A
├── Node B
└── Node C
If Node A is Parent of Node B and Node C
then == NodeB.parent_id == NodeC.parent_id.
### Each Node has a typename
Notation: Node[foo](bar) is a Node of typename "foo" and with name "bar".
Notation: `Node["FOO"]("bar")` is a Node of typename "FOO" and with name "bar".
- Then Node[project] is a project.
- Then Node[corpus] is a corpus.
- Then Node[document] is a document.
- Then Node[PROJECT] is a project.
- Then Node[CORPUS] is a corpus.
- Then Node[DOCUMENT] is a document.
### Each Node has a typename and a parent
The Node syntax used here does not exactly follow the Python notation
(for clarity and to begin with): in Python code, typenames are
represented as UPPERCASE strings (e.g. "PROJECT").
### Each Node has a typename and a parent
├── Node[project](myProject1)
│   ├── Node[corpus](myCorpus1)
│   ├── Node[corpus](myCorpus2)
│   └── Node[corpus](myCorpus3)
└── Node[project](myProject2)
├── Node[PROJECT](myProject1)
│   ├── Node[CORPUS](myCorpus1)
│   ├── Node[CORPUS](myCorpus2)
│   └── Node[CORPUS](myCorpus3)
└── Node[PROJECT](myProject2)
/!\\ 3 ways to manage rights of the Node:
/!\ 3 way to manage rights of the Node:
1) Then Node[User] is a folder containing all User projects and corpus and
documents (i.e. Node[user] is the parent_id of the children).
2) Each node as a user_id (mainly used today)
3) Right management for the groups (implemented already but not
used since not connected to the frontend).
1. Then Node[User] is a folder containing all the user's projects, corpora and
documents (i.e. Node[user] is the parent_id of the children).
2. Each node has a user_id (mainly used today)
3. Rights management for groups (already implemented but not
used since it is not connected to the frontend).
## Global Parameters
Global User is Gargantua (Node with typename User).
This node is the parent of the others Nodes for parameters.
Global User is Gargantua (Node with typename user).
This node is the parent of the other nodes for parameters.
Node[USER](gargantua) ( == Node[USER].user_id)
├── Node[TFIDF-Global](global) : without group
│   ├── Node[TFIDF](database1)
│   ├── Node[TFIDF](database2)
│   └── Node[TFIDF](database3)
└── Node[ANOTHERMETRIC](global)
[//]: # (Are there any plans to add user wide or project wide parameters or metrics? For example TFIDF nodes related to a normal user -- ie. not Gargantua?)
Yes, we can in the future (but we have other priorities first).
[//]: # (What is the purpose of the 3 child nodes of Node[TFIDF-Global]? Are they TFIDF metrics related to databases 1, 2 and 3? If so, shouldn't they be children of related CORPUS nodes?)
Node placement in the tree indicates the context of the metric: the
Metrics Node has parent the corpus Node to indicate the context of the
Node[user](gargantua) ( == Node[user].user_id)
├── Node[TFIDF-Global](global) : without group
│   ├── Node[tfidf](database1)
│   ├── Node[tfidf](database2)
│   └── Node[tfidf](database2)
└── Node[anotherMetric](global)
├── Node[PROJECT](project1)
│   ├── Node[CORPUS](corpus1)
│   │   ├── Node[DOCUMENT](doc1)
│   │   ├── Node[DOCUMENT](doc2)
│   │ └── Node[TFIDF-global](name of the metrics)
│   ├── Node[CORPUS](corpus2)
│   └── Node[CORPUS](corpus3)
└── Node[PROJECT](project2)
## NodeNgram
NodeNgram is a relation of a Node with a ngram:
- document and ngrams
- metrics and ngrams (position of the node metrics indicates the
- documents and ngrams
- metrics and ngrams (position of the node metrics indicates the
* Create user gargantua
Main user of Gargantext is Gargantua (role of Pantagruel soon)!
``` bash
sudo adduser --disabled-password --gecos "" gargantua
* Create the directories you need
here for the example gargantext package will be installed in /srv/
``` bash
for dir in "/srv/gargantext"
"/srv/env_3-5"; do
sudo mkdir -p $dir ;
sudo chown gargantua:gargantua $dir ;
You should see:
$tree /srv
├── gargantext
├── gargantext_lib
├── gargantext_media
│   └── srv
│   └── env_3-5
└── gargantext_static
* Get the main libraries
Download and uncompress them, then give the main user access to them.
Please be patient: due to the size of the packaged libraries (27 GB),
this step can take a long time.
``` bash
wget \
&& tar xvjf gargantext_lib.tar.bz2 -o /srv/gargantext_lib \
&& sudo chown -R gargantua:gargantua /srv/gargantext_lib \
&& echo "Libs installed"
* Get the source code of Gargantext
by cloning the repository of gargantext
``` bash
git clone ssh:// /srv/gargantext \
&& cd /srv/gargantext \
&& git fetch origin stable \
&& git checkout stable \
TODO(soon): git clone
* Install and configure the virtual environment
``` bash
cd /srv/
pip3 install virtualenv
virtualenv /srv/env_3-5 -p /usr/bin/python3.5
pip install -r /srv/gargantext/install
echo '/srv/gargantext' > /srv/env_3-5/lib/python3.5/site-packages/gargantext.pth
echo 'alias venv="source /srv/env_3-5/bin/activate"' >> ~/.bashrc
See the [next steps of installation procedure](
See the [next manual steps of installation procedure](
# Gargantext foundations
Collaborative platform for multi-scale text experiments
Embrace the past, update the present, forecast the future.
# Main Types of Entity definitions
Documentation valid for 3.0.\* versions of Gargantext.
## Nature of the entities
In an object-oriented programming language, entities are objects.
In a purely functional language, they are types.
## Project
A project is a list of corpora (a project may have duplicate corpora).
## Corpus
A corpus is a set of documents: duplicate documents are authorized but
not recommended for the methodology since it shows artificial repeated content in the corpus.
Then, in the document view, users may delete duplicates with a specific
## Document
A document is the main Entity of Textual Context (ETC) that is composed with:
- a title (truncated field name in the database)
- the date of publication
- a journal (or source)
- an abstract
- the authors
Users may add many fields to the document.
The main fields mentioned above are used for the main statistics in Gargantext.
### Source Type
Source Type is the source (database) from where documents have been
In 3.0.\* versions of Gargantext, each corpus has only one source type
(i.e. database). But users can build their own corpus using the CSV format.
## Ngrams
### Definitions
### Gram
A gram is a contiguous sequence of letters separated by spaces.
### N-gram
N-gram is a contiguous sequence of n grams separated by spaces (where n
is a non negative natural number).
## N-gram Lists
## Main ngrams lists: Stop/Map/Main
The recipe of Gargantext consists of offering the right ngrams for the map,
at the best level of complexity in order to unveil its richness,
according to these 2 main rules:
If ngrams are too specific, then the graph becomes too sparse.
If ngrams are too generic, then the graph becomes too connected.
As a consequence, finding the right balance of specific and generic
ngrams is the main target.
In first versions of Gargantext, this balance is solved with linear
methods. After 3.1.\*, non linear methods trained on dataset of the
users enable the system to find a better balance at any scale.
### Definition
3 main kinds of lists :
1. Stop List contains blacklisted ngrams, i.e. the noise or, in other words, ngrams users do not want to deal with.
2. Map List contains ngrams that will be shown in the map.
3. Main list or Candidate list contains all other ngrams that are neither in the stop list nor in the map list. They _could_ then be in the map according to the choice of the user or, by default, the default parameters of Gargantext.
### Storage
Relation between the list and the ngram is stored as Node-Ngram
relation where
- Node has type name (STOP|MAIN|MAP) and parent_id the context
(CORPUS in version 3.0.*; but could be PROJECT)
- Ngrams depend on the context of the Node List where NodeNgrams is
not null and Node has typename Document.
├── Node[PROJECT](project1)
│   ├── Node[CORPUS](corpus1)
│   │   ├── Node[MAPLIST](list name)
│   │   ├── Node[STOPLIST](list name)
│   │   ├── Node[MAINLIST](list name)
│ │  │  
│   │   ├── Node[DOCUMENT](doc1)
│   │   ├── Node[DOCUMENT](doc2)
│   │ └── Node[DOCUMENT](doc2)
### Policy
#### Algo
Let be a set of ngrams where NodeNgram != 0 then
find 2 subsets of these ngrams that show a split
- stop ngrams
- not stop ngrams
then for the subset "not stop ngrams"
find 2 subsets of ngrams that show a split:
- map ngrams
- other ngrams
#### Techno algo
A classifier (Support Vector Machine) is used on the following scaled measures
for each step:
- n (of the "n" gram)
- Occurrences: Zipf's law (in fact already used in TFICF; these
features are correlated, listed here for pedagogical purposes)
- Genericity score
- Specificity score
## Metrics
### Term Frequency - Inverse Context Frequency (TF-ICF)
TFICF, short for term frequency-inverse context frequency, is a numerical
statistic that is intended to reflect how important an ngram is to a
context of text.
TFICF(ngram,contextLocal,contextGlobal) = TF(ngram,contextLocal) \* ICF(ngram, contextGlobal)
* TF(ngram, contextLocal) is the ngram frequency (occurrences) in contextLocal.
* ICF(ngram, contextGlobal) is the inverse (log) document frequency (occurrences) in contextGlobal.
Other types of TFICF:
If the context is a document in a set of documents (corpus), then it is a TFIDF as usual.
TFICF is the generalization of [TFIDF, Term Frequency - Inverse Document Frequency](
#### Implementation
TFICF = TF * log (ICF)
To prepare the groups, we need to store TF and ICF separately (in
NodeNgram via 2 nodes).
Let TF and ICF be typenames of Nodes.
├── Node[OCCURRENCES](source)
├── Node[TF](all sourcetype)
├── Node[ICF](all sourcetype)
├── Node[SOURCETYPE](Pubmed)
│   ├── Node[OCCURRENCES](all corpora)
│   ├── Node[TF](all corpora)
│   └── Node[ICF](all corpora)
## Other ngram lists
### Group List
#### Definition
Group list gives a quantifiable link between two ngrams.
#### Policy to build group lists
To group the ngrams:
- stemming or lemmatization
- c-value
- clustering (see graphs)
- manually by the user (supervised learning)
The scale is the character.
#### Storage
In the table NodeNgramNgram where Node has type name Group for ngram1
and ngram2.
### Favorite List
#### Definition
Favorite Nodes
The scale is the node.
#### Building policy
- manually by the user (supervised learning)
#### Storage
NodeNode relation where first Node has type Favorite.
List of garg's own JSON API(s) urls
### /api/nodes/2
"id": 2,
"parent_id": 1,
"name": "abstract:\"evaporation+loss\"",
"typename": "CORPUS"
### /api/nodes?pagination_limit=-1
"records": [
"id": 9,
"parent_id": 2,
"name": "A recording evaporimeter",
"typename": "DOCUMENT"
"id": 119,
"parent_id": 81,
"name": "GRAPH EXPLORER COOC (in:81)",
"typename": "COOCCURRENCES"
"count": 119,
"parameters": {
"formated": "json","pagination_limit": -1,
"fields": ["id","parent_id","name","typename"],
"pagination_offset": 0
### /api/nodes?types[]=CORPUS
"records": [
"id": 2,
"parent_id": 1,
"name": "abstract:\"evaporation+loss\"",
"typename": "CORPUS"
"id": 8181,
"parent_id": 1,
"name": "abstract:(astrogeology+OR ((space OR spatial) AND planetary) AND geology)",
"typename": "CORPUS"
"count": 2,
"parameters": {
"pagination_limit": 10,
"types": ["CORPUS"],
"formated": "json",
"pagination_offset": 0,
"fields": ["id","parent_id","name","typename"]
### /api/nodes/5?fields[]=ngrams
<5> représente un doc_id ou list_id
"ngrams": [
[1.0,{"id":1755,"n":2,"terms":"higher speeds"}],
[1.0,{"id":2221,"n":3,"terms":"other synthesized materials"}],
[9.0,{"id":1754,"n":2,"terms":"spherical gauges"}],
[1.0,{"id":1981,"n":2,"terms":"wind effects"}]
### api/nodes/3?fields[]=id&fields[]=hyperdata&fields[]=typename
"id": 3,
"typename": "DOCUMENT",
"hyperdata": {
"language_name": "English",
"language_iso3": "eng",
"language_iso2": "en",
"title": "A blabla analysis of laser treated aluminium blablabla",
"name": "A blabla analysis of laser treated aluminium blablabla",
"authors": "A K. Jain, V.N. Kulkarni, D.K. Sood",
"authorsRAW": [
{"name": "....", "affiliations": ["... Research Centre,.. 085, Country"]},
{"name": "....", "affiliations": ["... Research Centre,.. 086, Country"]}
"abstract": "Laser processing of materials, being a rapid melt quenching process, quite often produces a surface which is far from being ideally smooth for ion beam analysis. (...)",
"genre": ["research-article"],
"doi": "10.1016/0029-554X(81)90998-8",
"journal": "Nuclear Instruments and Methods In Physics Research",
"publication_year": "1981",
"publication_date": "1981-01-01 00:00:00",
"publication_month": "01",
"publication_day": "01",
"publication_hour": "00",
"publication_minute": "00",
"publication_second": "00",
"id": "61076EB1178A97939B1C893904C77FB7DA2276D0",
"source": "elsevier",
"distributor": "istex"
## TODO continuer la liste
// dot -Tpng -o ngram_parsing_flow.png
digraph ngramflow {
edge [fontsize=10] ;
label=<<B><U>gargantext.util.toolchain</U></B><BR/>(ngram extraction flow)>;
labelloc="t" ;
"extracted_ngrams" -> "grouplist" ;
"extracted_ngrams" -> "occs+ti_rank" ;
"project stoplist (todo)" -> "stoplist" ;
"stoplist" -> "mainlist" ;
"occs+ti_rank" -> "mainlist" [label=" TI_RANK_LIMIT"];
"mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
"coocs" -> "specificity" ;
"specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
"mainlist" -> "tfidf" ;
"tfidf" -> "explore" [label="doc relations with all map and candidates"];
"maplist" -> "explore" ;
"grouplist" -> "occs+ti_rank" ;
"grouplist" -> "coocs" ;
"grouplist" -> "tfidf" ;
# Contribution guide
## Community
* [](
* IRC Chat: (OFTC/FreeNode) #gargantex
* gogs
* server access
* forge
* gargantext box
* Gargantext box install
(S.I.R.= Setup Install & Run procedures)
* Architecture Overview
* Database Schema Overview
* Interface design Overview
## To do:
* Docs
* Interface design
* Parsers/scrapers
* Computing
## How to contribute:
1. Clone the repo
2. Create a new branch <username>-refactoring
3. Run the gargantext-box
4. Code
6. Commit
### Example 1: Adding a parser
* create your new file into gargantex/scrapers/
* reference into gargantex/scrapers/
add this line:
import as cern
* reference into gargantext/constants
# type 9
{ 'name': 'Cern',
'parser': CernParser,
'default_language': 'en',
* add an APIKEY in gargantext/settings
### Example 2: User Interface Design
Cycle de vie des décomptes ngrammes
### (schéma actuel et pistes) ###
Dans ce qui crée les décomptes, on peut distinguer deux niveaux ou étapes:
1. l'extraction initiale et le stockage du poids de la relation ngramme
document (appelons ces nodes "1doc")
2. tout le reste: la préparation des décomptes agrégés pour la table
termes ("stats"), et pour les tables de travail des graphes et de la
recherche de publications.
On pourrait peut-être parler d'indexation par docs pour le niveau 1 et de "modélisations" pour le niveau 2.
On peut remarquer que le niveau 1 concerne des **formes** ou ngrammes seuls (la forme observée <=> chaîne de caractères unique après normalisation) tandis que dans le niveau 2 on a des objets plus riches... Au fur et à mesure des traitements on a finalement toujours des ngrammes mais:
- filtrés (on ne calcule pas tout sur tout)
- typés avec les listes map, stop, main (et peut-être bientôt des
"ownlistes" utilisateur)...
- groupés (ce qu'on voit avec le `+` de la table terme, et qu'on
pourrait peut-être faire apparaître aussi côté graphe?)
On peut dire qu'on manipule plutôt des **termes** au niveau 2 et non plus des **formes**... ils sont toujours des ngrammes mais enrichis par l'inclusion dans une série de mini modèles (agrégations et typologie de ngrammes guidée par les usages).
### Tables en BDD
Si on adopte cette distinction entre formes et termes, ça permet de clarifier à quel moment on doit mettre à jour ce qu'on a dans les tables. Côté structure de données, les décomptes sont toujours stockés via des n-uplets qu'on peut du coup résumer comme cela:
- **1doc**: (doc:node - forme:ngr - poids:float) dans des tables
- **occs/gen/spec/tirank**: (type_mesure:node - terme:ngr -
poids:float) dans des tables NodeNgram
- **cooc**: (type_graphe:node - terme1:ngr - terme2:ngr -
poids:float) dans des tables NodeNgramNgram
- **tfidf**: (type_lienspublis:node - doc:node - terme:ngr -
correlation:float) dans des tables NodeNodeNgram.
Où "type" est le node portant la nature de la stat obtenue, ou bien la
ref du graphe pour cooc et de l'index lié à la recherche de publis pour
le tfidf.
Il y a aussi les relations qui ne contiennent pas de décomptes mais sont
essentielles pour former les décomptes des autres:
- map/main/stopliste: (type_liste:node - forme ou terme:ngr) dans des
tables NodeNgram
- "groupes": (mainform:ngr - subform:ngr) dans des tables
### Scénarios d'actualisation
Alors, dans le déroulé des "scénarios utilisateurs", il y a plusieurs
événements qui viennent **modifier ces décomptes**:
A. les créations de termes opérées par l'utilisateur (ex: par
sélection/ajout dans la vue annotation)
B. les imports de termes correspondant à des formes jamais indexées sur
ce corpus
C. les dégroupements de termes opérés par l'utilisateur
D. le passage d'un terme de la stopliste aux autres listes
E. tout autre changement de listes et/ou création de nouveaux groupes
A et B sont les deux seules étapes hormis l'extraction initiale où des
formes sont rajoutées. Actuellement A et B sont gérés tout de suite pour
le niveau 1 (tables par doc) : il me semble qu'il est bon d'opérer la
ré-indexation des 1doc le plus tôt possible après A ou B. Pour la vue
annotations, l'utilisateur s'attend à voir apparaître le surlignage
immédiatement sur le doc visualisé. Pour l'import B, c'est pratique car
on a la liste des nouveaux termes sous la main, ça évite de la stocker
quelque part en attendant un recalcul ultérieur.
L'autre info mise à jour tout de suite pour A et B est l'appartenance
aux listes et aux groupes (pour B), qui ne demandent aucun calcul.
C, D et E n'affectent pas le niveau 1 (tables par docs) car ils ne
rajoutent pas de formes nouvelles, mais constituent des modifications
sur les listes et les groupes, et devront donc provoquer une
modification du tfidf (pour cela on doit passer par un re-calcul) et des
coocs sur map (effet appliqué à la demande d'un nouveau graphe).
C et D demandent aussi une mise à jour des stats par termes
(occurrences, gen/spec etc) puisque les éléments subforms et les
éléments de la stopliste ne figurent pas dans les stats.
Donc pour résumer on a dans tous les cas:
=> l'ajout à une liste, à un groupe et tout éventuel décompte de
nouvelle forme dans les docs sont gérés dès l'action utilisateur
=> mais les modélisations plus "avancées" représentées par les
stats occs, gen, spec et les tables de travail "coocs sur map" et
"tfidf" doivent attendre un recalcul.
Idéalement à l'avenir ils seraient tous mis à jour incrémentalement au
lieu de forcer ce recalcul... mais pour l'instant on en est là.
### Fonctions associées
| | GUI | API action → url | VIEW | SUBROUTINES |
| A | "annotations/highlight.js, annotations/ngramlists.js" | "PUT → api/ngrams, PUT/DEL → api/ngramlists/change" | "ApiNgrams, ListChange" | util.toolchain.ngrams_addition.index_new_ngrams |
| B | NGrams_dyna_chart_and_table | POST/PATCH → api/ngramlists/import | CSVLists | "util.ngramlists_tools.import_ngramlists, util.ngramlists_tools.merge_ngramlists, util.toolchain.ngrams_addition.index_new_ngrams" |
| C,D,E | NGrams_dyna_chart_and_table | "PUT/DEL → api/ngramlists/change, PUT/DEL → api/ngramlists/groups" | "ListChange, GroupChange" | util.toolchain.ngrams_addition.index_new_ngrams |
L'import B a été remis en route il y a quelques semaines, et je viens de
reconnecter A dans la vue annotations.
cf. "ng-resource".
# Projects
## Overview of all projects
- re-implement deletion
## Single project view
- re-implement deletion
# Taggers
Path for data used by taggers should be defined in `gargantext.constants`.
# Database
# Sharing
Here follows a brief description of how sharing could be implemented.
## Database representation
The database representation of sharing can be distributed among 4 tables:
- `persons`, of which items represent either a user or a group
- `relationships` describes the relationships between persons (affiliation
of a user to a group, contact between two users, etc.)
- `nodes` contains the projects, corpora, documents, etc. to share (they shall
inherit the sharing properties from their parents)
- `permissions` stores the relations between the items of the tables
described above: it only consists of 2 foreign keys, plus an integer
between 1 and 3 representing the level of sharing and the start date
(when the sharing has been set) and the end date (when necessary, the time
at which sharing has been removed, `NULL` otherwise)
## Python code
The permission levels should be set in `gargantext.constants`, and defined as:
PERMISSION_NONE = 0 # 0b0000
PERMISSION_READ = 1 # 0b0001
The requests to check for permissions (or add new ones) should not be rewritten
every time. They should be "hidden" within the models:
- `Person.owns(node)` returns a boolean
- `Person.can_read(node)` returns a boolean
- `Person.can_write(node)` returns a boolean
- `Person.give_right(node, permission)` gives a right to a given user
- `Person.remove_right(node, permission)` removes a right from a given user
- `Person.get_nodes(permission[, type])` returns an iterator on the list of
nodes on which the person has at least the given permission (optional
argument: type of requested node)
- `Node.get_persons(permission[, type])` returns an iterator on the list of
users who have at least the given permission on the node (optional argument:
type of requested persons, such as `USER` or `GROUP`)
## Example
Let's imagine the `persons` table contains the following data:
| id | type | username |
| 1 | USER | David |
| 2 | GROUP | C.N.R.S. |
| 3 | USER | Alexandre |
| 4 | USER | Untel |
| 5 | GROUP | I.S.C. |
| 6 | USER | Bidule |
Assume "David" owns the groups "C.N.R.S." and "I.S.C.", "Alexandre" belongs to
the group "C.N.R.S.", with "Untel" and "Bidule" belonging to the group "I.S.C.".
"Alexandre" and "David" are in contact.
The `relationships` table then contains:
| person1_id | person2_id | type |
| 1 | 2 | OWNER |
| 1 | 5 | OWNER |
| 3 | 2 | MEMBER |
| 4 | 5 | MEMBER |
| 6 | 5 | MEMBER |
| 1 | 3 | CONTACT |
The `nodes` table is populated as such:
| id | type | name |
| 12 | PROJECT | My super project |
| 13 | CORPUS | A given corpus |
| 13 | CORPUS | The corpus |
| 14 | DOCUMENT | Some document |
| 15 | DOCUMENT | Another document |
| 16 | DOCUMENT | Yet another document |
| 17 | DOCUMENT | Last document |
| 18 | PROJECT | Another project |
| 19 | PROJECT | That project |
If we want to express that "David" created "My super project" (and its children)
and wants everyone in "C.N.R.S." to be able to view it, but not modify it,
`permissions` should contain:
| person_id | node_id | permission |
| 1 | 12 | OWNER |
| 2 | 12 | READ |
If "David" also wanted "Alexandre" (and no one else) to view and modify "The
corpus" (and its children), we would have:
| person_id | node_id | permission |
| 1 | 12 | OWNER |
| 2 | 12 | READ |
| 3 | 13 | WRITE |
If "Alexandre" created "That project" and wants "Bidule" (and no one else) to be
able to view and modify it (and its children), the table should then have:
| person_id | node_id | permission |
| 3 | 19 | OWNER |
| 6 | 19 | WRITE |
#User guide
1. Login
run the gargantext box following the install procedure
open a web browser at
click on Test Gargantext
login with:
Login : gargantua
Password : autnagrag
2. Create a project
3. Import an existing corpus
4. Create corpus from search
5. Explore stats
6. Explore graphs
7. Query
8. Refine
* Time periods
* Nodes
9. Export
#Architecture Overview
#Database Schema
Gargantext is a web platform to explore your corpora using text-mining[...](
## Getting started
* [Install]( the Gargantext box
* [Take a tour]( of the different features offered by Gargantext
##Need some help?
Ask the community at:
* [](
* IRC Chat: (OFTC/FreeNode) #gargantex
##Want to contribute?
* take a look at the [architecture overview](
* read the [contribution guide](
## News
## Credits and acknowledgments
#Install Instructions for Gargamelle:
Gargamelle is the Gargantext platform toolbox: it is a full platform system
with minimal modules.
First you need to get the source code to install it
The folder will be /srv/gargantext:
* docs contains all the information on gargantext
* install contains all the installation files
Help needed ?
See []( and [tools](./ for the community
## Get the source code
by cloning gargantext into /srv/gargantext
``` bash
git clone ssh:// /srv/gargantext \
&& cd /srv/gargantext \
&& git fetch origin stable \
&& git checkout stable \
## Install
# go into the directory
user@computer: cd /srv/gargantext/
#go inside installation folder
user@computer: cd install
#execute the installation
user@computer: ./install
The installation requires to create a user for gargantext, it will be asked:
Username (leave blank to use 'gargantua'):
#email is not mandatory
Email address:
Password (again):
If this step completes successfully you should see:
Superuser created successfully.
[ ok ] Stopping PostgreSQL 9.5 database server: main.
## Run
Once the installation is done, the Gargantext platform will be available at localhost:8000
To start the Gargantext platform:
``` bash
# go into the directory
user@computer: cd /srv/gargantext/
#run the start script
user@computer: ./start
#type ctrl+d to exit or simply type exit in terminal;
Then open up a chromium browser and go to localhost:8000
Click on "Enter Gargantext"
Log in with the username and password you created
Enjoy! ;)
* Create user gargantua
Main user of Gargantext is Gargantua (role of Pantagruel soon)!
``` bash
sudo adduser --disabled-password --gecos "" gargantua
* Create the directories you need
here for the example gargantext package will be installed in /srv/
``` bash
for dir in "/srv/gargantext"
"/srv/env_3-5"; do
sudo mkdir -p $dir ;
sudo chown gargantua:gargantua $dir ;
You should see:
$tree /srv
├── gargantext
├── gargantext_lib
├── gargantext_media
│   └── srv
│   └── env_3-5
└── gargantext_static
* Get the main libraries
Download and uncompress them, then give the main user access to them.
Please be patient: due to the size of the library packages (27 GB),
this step can be long...
``` bash
wget \
&& tar xvjf gargantext_lib.tar.bz2 -o /srv/gargantext_lib \
&& sudo chown -R gargantua:gargantua /srv/gargantext_lib \
&& echo "Libs installed"
* Get the source code of Gargantext
by cloning the repository of gargantext
``` bash
git clone ssh:// /srv/gargantext \
&& cd /srv/gargantext \
&& git fetch origin refactoring \
&& git checkout refactoring \
TODO(soon): git clone
See the [next steps of installation procedure](
#Architecture Overview
#Database Schema
# HOW TO: Reference a new web scraper/API + parser
## Global scope
Three main moves to do:
- develop and index a parser
in gargantext.util.parsers
- develop and index a scraper
in gargantext.moissonneurs
- adapt forms for a new source
in templates and views
## Reference parser into gargantext website
gargantext website is stored in gargantext/gargantext
### reference your new parser into
* import your parser l.125
from gargantext.util.parsers import \
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser
The parser corresponds to the name of the parser referenced in gargantext/util/parser
here name is CernParser
* index your RESOURCETYPE
in RESOURCETYPES (l.145) **at the end of the list**
# type 10
{ "name": 'SCOAP (XML MARC21 Format)',
"parser": CernParser,
"default_language": "en",
A noter le nom ici est composé de l'API_name(SCOAP) + (GENERICFILETYPE FORMAT_XML Format)
La complexité du nommage correspond à trois choses:
* le nom de l'API (different de l'organisme de production)
* le type de format: XML
* la norme XML de ce format : MARC21 (cf. CernParser in gargantext/util/parser/ )
The default_language corresponds to the default accepted language that **should load** the default corresponding tagger
from gargantext.util.taggers import NltkTagger
TO DO: charger à la demande les types de taggers en fonction des langues et de l'install
TO DO: proposer un module pour télécharger des parsers supplémentaires
TO DO: provide install tagger module scripts inside lib
Les formats correspondent aux types de fichiers acceptés lors de l'envoi du fichier dans le formulaire de
parsing disponible dans `gargantext/view/pages/` et
exposé dans `/templates/pages/projects/project.html`
## reference your parser script
## add your parser script into folder gargantext/util/parser/
here my filename was
##declare it into gargantext/util/parser/
from .Cern import CernParser
At this step, you will be able to see your parser and add a file with the form
but nothing will occur
## the good way to write the scrapper script
Three main and only requirements:
* your parser class should inherit from the base class _Parser()
* your parser class must have a parse method that take a **file buffer** as input
* your parser must structure and store data into the **hyperdata_list** variable
to be properly indexed by toolchain
! Be careful of date format: provide a publication_date in a string format YYYY-mm-dd HH:MM:SS
# Adding a scrapper API to offer search option:
En cours
* Add pop up question Do you have a corpus
option search in /templates/pages/projects/project.html line 181
## Reference a scrapper (moissonneur) into gargantext
* adding accepted_formats in constants
* adding check_file routine in Form check ==> but should inherit from utils/
that also has the upload size limit check implemented
# Suggestion 4 next steps:
* XML parser MARC21 UNIMARC ...
* A project type is qualified by the first element added, i.e.:
the first element determines the type of corpus of all the corpora within the project
Adding a new source into Gargantext requires a previous declaration
of the source inside
{ "type":9, #give a unique type int
"name": 'SCOAP [XML]', #resource name as proposed into the add corpus FORM [generic format]
"parser": "CernParser", #name of the new parser class inside a file (set to None if not implemented)
"format": 'MARC21', #specific format
'file_formats':["zip","xml"],# accepted file format
"crawler": "CernCrawler", #name of the new crawler class inside a file (set to None if no Crawler implemented)
'default_languages': ['en', 'fr'], #supported default languages of the source
## adding a new parser
Once you declared your new parser inside
add your new parser file into /srv/gargantext/utils/parsers/
following this naming convention:
* Filename must be in uppercase without the Parser mention.
eg. MailParser =>
* Inside this file the Parser must be called following the exact spelling declared as parser in
* Your new parser shall inherit from the base class Parser and provide a parse(filebuffer) method
#!/usr/bin/python3 env
from ._Parser import Parser
class MailParser(Parser):
def parse(self, file):
## adding a new crawler
Once you declared your new crawler inside
add your new crawler file into /srv/gargantext/utils/parsers/
following this naming convention:
* Filename must be in uppercase without the Crawler mention.
eg. MailCrawler =>
* Inside this file the Crawler must be called following the exact spelling declared as crawler in
* Your new crawler shall inherit from the base class Crawler and provide three methods:
* scan_results => ids
* sample = > yes/no
* fetch
#!/usr/bin/python3 env
from ._Crawler import Crawler
class MailCrawler(Crawler):
def scan_results(self, query):
self.ids = set()
def sample(self, results_nb):
def fetch(self, ids):
// dot -Tpng -o ngram_parsing_flow.png
digraph ngramflow {
edge [fontsize=10] ;
label=<<B><U>gargantext.util.toolchain</U></B><BR/>(ngram extraction flow)>;
labelloc="t" ;
"extracted_ngrams" -> "grouplist" ;
"extracted_ngrams" -> "occs+tfidfs" ;
"main_user_stoplist" -> "stoplist" ;
"stoplist" -> "mainlist" ;
"occs+tfidfs" -> "mainlist" [label=" TFIDF_LIMIT"];
"mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
"coocs" -> "specificity" ;
"specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
"maplist" -> "explore" ;
"grouplist" -> "maplist" ;
from .celery import app as celery_app
Setup the Celery instance (see also gargantext/ that will be
used by all shared_task.
This is the recommended way:
import os
from celery import Celery
# Name the Django settings module for this process; setdefault leaves any
# value already present in the environment untouched.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
# Imported only after the environment variable above has been set.
from django.conf import settings #noqa
# Celery application instance, named 'gargantext'.
app = Celery('gargantext')
# The lambda defers reading settings.INSTALLED_APPS until Celery calls it.
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
......@@ -59,25 +59,25 @@ LISTTYPES = {
# TODO separate id not array index, read by models.node
None, # 0
# documents hierarchy
# node/file hierarchy
'USER', # 1
'PROJECT', # 2
#RESOURCE should be here but last
'CORPUS', # 3
# lists
# lists of ngrams
'MAPLIST', # 8
# scores
# scores for ngrams
'CVALUE', # 12
# docs subset
# node subset
# more scores (sorry!)
from import BaseCommand, CommandError
from import tree_show, nodes
import colorama
class Command(BaseCommand):
help = 'Nodes'
def add_arguments(self, parser):
parser.add_argument(dest='action', default='show')
def handle(self, *args, **options):
action = options.get('action')
if action == 'show':
for root in nodes():
from import BaseCommand, CommandError
from gargantext.models import Node
class Command(BaseCommand):
    """Minimal management command that writes a fixed message to stdout."""

    help = 'Something'

    def handle(self, *args, **options):
        """Write the message to the command's output stream.

        ``args`` and ``options`` are accepted to match the management
        command interface; neither is read here.
        """
        # The previous version had an extra closing parenthesis on this call,
        # which made the whole module fail to parse.
        self.stdout.write('Oh yeah!')
from .base import Base
from .nodes import *
from .hyperdata import *
from .users import *
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import relationship
from sqlalchemy.types import TypeDecorator, \
Integer, Float, Boolean, DateTime, String, Text
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
from sqlalchemy.ext.mutable import MutableDict, MutableList
from sqlalchemy.ext.declarative import declarative_base
# Names exported by ``from .base import *``: schema helpers, column types,
# PostgreSQL-specific types, mutable wrappers, then the two declarative bases.
__all__ = [
    "Column",
    "ForeignKey",
    "UniqueConstraint",
    "relationship",
    "Integer",
    "Float",
    "Boolean",
    "DateTime",
    "String",
    "Text",
    "JSONB",
    "Double",
    "MutableDict",
    "MutableList",
    "Base",
    "DjangoBase",
]

# Declarative base for every model whose table is managed through Alembic
# migration scripts: Base.metadata is the collection of those tables.
Base = declarative_base()

# Separate declarative base for tables that the Django ORM already handles
# (the User model, for instance), so that they stay outside Base.metadata and
# therefore out of Alembic's sight.
DjangoBase = declarative_base()
from gargantext.util.db import *
from gargantext.constants import INDEXED_HYPERDATA
from .base import Base, Column, ForeignKey, TypeDecorator, Index, \
Integer, Double, DateTime, String, Text
from .nodes import Node
import datetime
......@@ -64,6 +65,14 @@ class NodeHyperdata(Base):
__tablename__ = 'nodes_hyperdata'
__table_args__ = (
Index('nodes_hyperdata_node_id_value_utc_idx', 'node_id', 'value_utc'),
Index('nodes_hyperdata_node_id_key_value_utc_idx', 'node_id', 'key', 'value_utc'),
Index('nodes_hyperdata_node_id_key_value_str_idx', 'node_id', 'key', 'value_str'),
Index('nodes_hyperdata_node_id_key_value_int_idx', 'node_id', 'key', 'value_int'),
Index('nodes_hyperdata_node_id_key_value_flt_idx', 'node_id', 'key', 'value_flt'),
Index('nodes_hyperdata_node_id_key_idx', 'node_id', 'key'))
id = Column( Integer, primary_key=True )
node_id = Column( Integer, ForeignKey(, ondelete='CASCADE'))
key = Column( HyperdataKey )
from gargantext.util.db import *
from gargantext.util.files import upload
from gargantext.constants import *
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from datetime import datetime
from .users import User
from .nodes import Node
nodes_tree :: Int -> Tree Nodes
for project_id, project_name in nodes_list(user_id, 'Project'):
print("* Project (%d, %s)" % (project_id, project_name))
for corpus_id, corpus_name in nodes_list(user_id, 'Corpus', parent_id=project_id):
count = nodes_list( user_id
, 'Document'
, parent_id=corpus_id
, count=True
if count > 1:
print("|__ %d %s" % ( corpus_id, corpus_name ))
print(" |___ %s docs" % count)
def copy_nodes(node_id, to_parent_id=None, enabled=['PROJECT', 'CORPUS', 'DOCUMENT']):
node = session.query(Node_v2).filter(
nodetype = session.query(NodeType_v2).filter( == node.type_id).first()
resource = (session.query(ResourceType)
.join(NodeResource, NodeResource.resource_id ==
.filter(NodeResource.node_id ==
nodetype_proj_id = session.query( == 'Project' ).first()
nodetype_corp_id = session.query( == 'Corpus' ).first()
nodetype_docu_id = session.query( == 'Document').first()
typename =
# Import a project:
# new_project = Node(
# user_id =,
# typename = 'PROJECT',
# name = name,
# )
# session.add(new_project)
# session.commit()
if typename in enabled:
parent_node = session.query(Node).filter(
if parent_node is not None:
corpus = parent_node.add_child(
name =,
typename = typename
corpus.hyperdata['languages'] = {'fr' : 100}
type = resourcetype(
type = resourcetype('Europress (French)')
print("%s copied" %
nodes = (session.query(Node_v2)
.filter(Node_v2.parent_id ==
.filter(Node_v2.type_id == nodetype_docu_id)
for n in nodes:
doc = corpus.add_child( name =
, typename = "DOCUMENT"
, hyperdata = n.hyperdata
# else:
# print("%d is None" % parent_id)
print('%s is not enabled' % typename)
from gargantext.util.db import *
from .base import Base, Column, ForeignKey, relationship, Index, \
Integer, Float, String
from .nodes import Node
__all__ = ['Ngram', 'NodeNgram', 'NodeNodeNgram', 'NodeNgramNgram']
......@@ -7,17 +7,39 @@ __all__ = ['Ngram', 'NodeNgram', 'NodeNodeNgram', 'NodeNgramNgram']
class Ngram(Base):
    """Row of the ``ngrams`` table.

    Columns: ``id`` (primary key), ``terms`` (unique string of at most 255
    characters) and the integer ``n`` (presumably the number of words in
    ``terms`` -- confirm against the extraction code).
    """

    __tablename__ = 'ngrams'
    __table_args__ = (
        Index('ngrams_id_n_idx', 'id', 'n'),
        Index('ngrams_n_idx', 'n'))

    id = Column(Integer, primary_key=True)
    terms = Column(String(255), unique=True)
    n = Column(Integer)

    def __str__(self):
        """Return the short form ``<terms>#n``."""
        return '<{0.terms}>#{0.n}'.format(self)

    def __repr__(self):
        """Return a debugging representation with id, terms and n."""
        # Every field uses explicit index 0: str.format() raises ValueError
        # when an automatic '{}' field is mixed with manually numbered ones.
        return '<Ngram(id={0.id}, terms={0.terms!r}, n={0.n})>'.format(self)
class NodeNgram(Base):
    """Row of ``nodes_ngrams``: links one node to one ngram with a weight.

    The pair (``node_id``, ``ngram_id``) is the composite primary key.
    """

    __tablename__ = 'nodes_ngrams'
    __table_args__ = (
        Index('nodes_ngrams_node_id_ngram_id_idx', 'node_id', 'ngram_id'),
        Index('nodes_ngrams_node_id_idx', 'node_id'),
        Index('nodes_ngrams_ngram_id_idx', 'ngram_id'))

    # ForeignKey() needs its target column as first argument; it was missing,
    # which was a syntax error. The targets follow from the column names and
    # from the relationships declared below.
    node_id = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
    ngram_id = Column(Integer, ForeignKey(Ngram.id, ondelete='CASCADE'), primary_key=True)
    weight = Column(Float)

    node = relationship(Node)
    ngram = relationship(Ngram)

    def __repr__(self):
        """Return a debugging representation with node id, ngram and weight."""
        return '<NodeNgram(node_id={0.node_id}, ngram={0.ngram}, weight={0.weight})>'.format(self)
class NodeNodeNgram(Base):
""" for instance for TFIDF
......@@ -28,6 +50,10 @@ class NodeNodeNgram(Base):
__tablename__ = 'nodes_nodes_ngrams'
__table_args__ = (
Index('nodes_nodes_ngrams_node2_id_idx', 'node2_id'),
Index('nodes_nodes_ngrams_node1_id_idx', 'node1_id'))
node1_id = Column(Integer, ForeignKey(, ondelete='CASCADE'), primary_key=True)
node2_id = Column(Integer, ForeignKey(, ondelete='CASCADE'), primary_key=True)
ngram_id = Column(Integer, ForeignKey(, ondelete='CASCADE'), primary_key=True)
......@@ -36,6 +62,14 @@ class NodeNodeNgram(Base):
# sinon par défaut on aurait un type sql "double_precision" (soit 15 chiffres)
# (cf.
node1 = relationship(Node, foreign_keys=[node1_id])
node2 = relationship(Node, foreign_keys=[node2_id])
ngram = relationship(Ngram)
def __repr__(self):
return '<NodeNodeNgram(node1_id={0.node1_id}, node2_id={0.node2_id}, ngram={0.ngram}, score={0.score})>'.format(self)
class NodeNgramNgram(Base):
""" for instance for COOCCURRENCES and GROUPLIST
......@@ -46,7 +80,20 @@ class NodeNgramNgram(Base):
__tablename__ = 'nodes_ngrams_ngrams'
__table_args__ = (
Index('nodes_ngrams_ngrams_node_id_ngram1_id_ngram2_id_idx', 'node_id', 'ngram1_id', 'ngram2_id'),
Index('nodes_ngrams_ngrams_node_id_idx', 'node_id'),
Index('nodes_ngrams_ngrams_ngram1_id_idx', 'ngram1_id'),
Index('nodes_ngrams_ngrams_ngram2_id_idx', 'ngram2_id'))
node_id = Column(Integer, ForeignKey(, ondelete='CASCADE'), primary_key=True)
ngram1_id = Column(Integer, ForeignKey(, ondelete='CASCADE'), primary_key=True)
ngram2_id = Column(Integer, ForeignKey(, ondelete='CASCADE'), primary_key=True)
weight = Column(Float(precision=24)) # see comment for NodeNodeNgram.score
node = relationship(Node)
ngram1 = relationship(Ngram, foreign_keys=[ngram1_id])
ngram2 = relationship(Ngram, foreign_keys=[ngram2_id])
def __repr__(self):
return '<NodeNgramNgram(node_id={0.node_id}, ngram1={0.ngram1}, ngram2={0.ngram2}, weight={0.weight})>'.format(self)
from gargantext.util.db import *
from gargantext.util.db import session
from gargantext.util.files import upload
from gargantext.constants import *
from sqlalchemy_utils.types import TSVectorType
from datetime import datetime
from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
Integer, Float, String, DateTime, JSONB, \
MutableList, MutableDict
from .users import User
__all__ = ['Node', 'NodeNode']
__all__ = ['Node', 'NodeNode', 'CorpusNode']
class NodeType(TypeDecorator):
"""Define a new type of column to describe a Node's type.
......@@ -19,23 +24,69 @@ class NodeType(TypeDecorator):
def process_result_value(self, typeindex, dialect):
return NODETYPES[typeindex]
class Node(Base):
"""This model can fit many purposes.
"""This model can fit many purposes:
myFirstCorpus = session.query(CorpusNode).first()
It intends to provide a generic model, allowing hierarchical structure
and NoSQL-like data structuring.
The possible types are defined in `gargantext.constants.NODETYPES`.
Thanks to __new__ overriding and SQLAlchemy's polymorphism, every Node
instance is automagically casted to its sub-class, assuming a typename
is specified.
>>> Node(name='without-type')
<Node(id=None, typename=None, user_id=None, parent_id=None, name='without-type', date=None)>
>>> Node(typename='CORPUS')
<CorpusNode(id=None, typename='CORPUS', user_id=None, parent_id=None, name=None, date=None)>
>>> from gargantext.util.db import session
>>> session.query(Node).filter_by(typename='USER').first() # doctest: +ELLIPSIS
But beware, there are some caveats with bulk queries. In this case typename
MUST be specified manually.
>>> session.query(UserNode).delete() # doctest: +SKIP
# Wrong: all nodes are deleted!
>>> session.query(UserNode).filter_by(typename='USER').delete() # doctest: +SKIP
# Right: only user nodes are deleted.
__tablename__ = 'nodes'
__table_args__ = (
Index('nodes_user_id_typename_parent_id_idx', 'user_id', 'typename', 'parent_id'),
Index('nodes_hyperdata_idx', 'hyperdata'))
# create INDEX full_text_idx on nodes using gin(to_tsvector('english', hyperdata ->> 'abstract' || 'title'));
id = Column(Integer, primary_key=True)
typename = Column(NodeType, index=True)
__mapper_args__ = { 'polymorphic_on': typename }
# foreign keys
user_id = Column(Integer, ForeignKey(, ondelete='CASCADE'))
parent_id = Column(Integer, ForeignKey('', ondelete='CASCADE'))
# main data
user_id = Column(Integer, ForeignKey(, ondelete='CASCADE'))
user = relationship(User)
parent_id = Column(Integer, ForeignKey('', ondelete='CASCADE'))
parent = relationship('Node', remote_side=[id])
name = Column(String(255))
date = Column(DateTime(),
date = Column(DateTime(timezone=True),
hyperdata = Column(JSONB, default=dict)
# metadata (see
hyperdata = Column(JSONB, default=dict)
# To make search possible uncomment the line below
#search_vector = Column(TSVectorType('hyperdata'))
def __new__(cls, *args, **kwargs):
if cls is Node and kwargs.get('typename'):
typename = kwargs.pop('typename')
return _NODE_MODELS[typename](*args, **kwargs)
return super(Node, cls).__new__(cls)
def __init__(self, **kwargs):
"""Node's constructor.
......@@ -55,6 +106,11 @@ class Node(Base):
self.hyperdata[key] = value
def __repr__(self):
return '<{0.__class__.__name__}(id={}, typename={0.typename!r}, ' \
'user_id={0.user_id}, parent_id={0.parent_id}, ' \
'name={!r}, date={})>'.format(self)
def ngrams(self):
"""Pseudo-attribute allowing to retrieve a node's ngrams.
......@@ -120,36 +176,6 @@ class Node(Base):
def resources(self):
"""Return all the resources attached to a given node.
Mainly used for corpora.
[{'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Franç',
'type': 1,
'url': None}]
if 'resources' not in self.hyperdata:
self['resources'] = MutableList()
return self['resources']
def add_resource(self, type, path=None, url=None):
"""Attach a resource to a given node.
Mainly used for corpora.
this just adds metadata to the CORPUS node (NOT for adding documents)
{'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Franç',
'type': 1,
'url': None}
{'type': type, 'path':path, 'url':url, 'extracted': False}
def status(self, action=None, progress=0, complete=False, error=None):
"""Get or update the status of the given action.
If no action is given, the status of the first uncomplete or last item
......@@ -187,8 +213,86 @@ class Node(Base):
return self['statuses'][-1]
class CorpusNode(Node):
__mapper_args__ = {
'polymorphic_identity': 'CORPUS'
def resources(self):
"""Return all the resources attached to a given node.
[{'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Franç',
'type': 1,
'url': None}]
if 'resources' not in self.hyperdata:
self['resources'] = MutableList()
return self['resources']
def add_resource(self, type, path=None, url=None):
"""Attach a resource to a given node.
this just adds metadata to the CORPUS node (NOT for adding documents)
{'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Franç',
'type': 1,
'url': None}
{'type': type, 'path':path, 'url':url, 'extracted': False}
class NodeNode(Base):
    """Row of ``nodes_nodes``: a scored link between two nodes.

    The pair (``node1_id``, ``node2_id``) is the composite primary key.
    """

    __tablename__ = 'nodes_nodes'
    __table_args__ = (
        Index('nodes_nodes_node1_id_node2_id_idx', 'node1_id', 'node2_id'),)

    # ForeignKey() needs its target column as first argument; it was missing,
    # which was a syntax error. Both columns point at Node, as the
    # relationships below do.
    node1_id = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
    node2_id = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
    score = Column(Float(precision=24))

    # Two foreign keys reference the same table, so each relationship names
    # the column it follows.
    node1 = relationship(Node, foreign_keys=[node1_id])
    node2 = relationship(Node, foreign_keys=[node2_id])

    def __repr__(self):
        """Return a debugging representation with both node ids and the score."""
        return '<NodeNode(node1_id={0.node1_id}, node2_id={0.node2_id}, score={0.score})>'.format(self)
# --8<-- Begin hack ------
# XXX Hack to automatically define subclasses of Node for every entry of NODETYPES,
# in order to avoid SQLAlchemy complaints -- and subsequent exceptions.
# We could manually write a class for every NodeType, or find a way to
# tell SQLAlchemy that it should stick to instantiating a Node when a
# class is not defined for the wanted typename.
set(cls.__mapper_args__.get('polymorphic_identity') for cls in Node.__subclasses__())
for nodetype in NODETYPES:
if nodetype and nodetype not in _ALREADY_IMPLEMENTED_NODE_TYPES:
# Convert nodetype to a CamelCase class name, assuming it's possible...
class_name = ''.join(nodetype.title().split("-")) + 'Node'
# Create new class and add it to global scope
globals()[class_name] = type(class_name, (Node,), {
"__mapper_args__": {
"polymorphic_identity": nodetype
# Add class to exports
# ------ End of hack ------
mapper.polymorphic_identity: mapper.class_
for mapper in Node.__mapper__.self_and_descendants
if mapper.class_ is not Node
from django.contrib.auth import models
from gargantext.util.db import *
from gargantext.util.db import session, aliased
from datetime import datetime
from .base import DjangoBase, Base, Column, ForeignKey, UniqueConstraint, \
Integer, Boolean, DateTime, String
__all__ = ['User', 'Contact']
class User(Base):
class User(DjangoBase):
# The properties below are a reflection of Django's auth module's models.
__tablename__ = models.User._meta.db_table
id = Column(Integer, primary_key=True)
......@@ -60,7 +63,7 @@ class User(Base):
"""check if a given node is owned by the user"""
return (node.user_id == or \ in ( for contact in self.contacts())
def get_params(self, username=None):
return self.hyperdata
"""Define ReplaceableObject and related operations
Implements operations to create/drop SQL objects such as views, stored
procedures and triggers that can't be "altered" but can be replaced -- hence
the name of "ReplaceableObject" class.
This recipe is directly borrowed from Alembic documentation, see
from alembic.operations import Operations, MigrateOperation
__all__ = ['ReplaceableObject']
class ReplaceableObject(object):
    """Named SQL object (view, stored procedure, trigger...) with its body.

    Such objects cannot be altered in place; they are dropped and created
    again, which is why the name and the SQL text are kept together.
    """

    def __init__(self, name, sqltext):
        """Store the object's name and the SQL text that defines it."""
        # The assignment target was missing, leaving the line syntactically
        # invalid and the name unset.
        self.name = name
        self.sqltext = sqltext
class ReversibleOp(MigrateOperation):
    """Base class for migration operations that know their inverse.

    Follows the "Replaceable Objects" recipe of the Alembic cookbook:
    subclasses implement `reverse()` and are registered on `Operations`.
    """

    def __init__(self, target):
        """Keep the object (e.g. a ReplaceableObject) the operation acts on."""
        self.target = target

    @classmethod
    def invoke_for_target(cls, operations, target):
        """Build the operation for *target* and invoke it on *operations*."""
        op = cls(target)
        return operations.invoke(op)

    def reverse(self):
        """Return the inverse operation; must be provided by subclasses."""
        raise NotImplementedError()

    @classmethod
    def _get_object_from_version(cls, operations, ident):
        """Fetch an object from another revision module.

        *ident* has the form "<revision>.<attribute name>".
        """
        version, objname = ident.split(".")
        module = operations.get_context().script.get_revision(version).module
        obj = getattr(module, objname)
        return obj

    @classmethod
    def replace(cls, operations, target, replaces=None, replace_with=None):
        """Swap one version of an object for another.

        With `replaces` (upgrade) the old object is dropped and *target* is
        created; with `replace_with` (downgrade) *target* is dropped and the
        old object is created again.

        Raises:
            TypeError: if neither `replaces` nor `replace_with` is given.
        """
        if replaces:
            old_obj = cls._get_object_from_version(operations, replaces)
            drop_old = cls(old_obj).reverse()
            create_new = cls(target)
        elif replace_with:
            old_obj = cls._get_object_from_version(operations, replace_with)
            drop_old = cls(target).reverse()
            create_new = cls(old_obj)
        else:
            # Without this branch the error was raised unconditionally.
            raise TypeError("replaces or replace_with is required")

        # Drop first, then create, as in the Alembic recipe.
        operations.invoke(drop_old)
        operations.invoke(create_new)
@Operations.register_operation("create_view", "invoke_for_target")
@Operations.register_operation("replace_view", "replace")
class CreateViewOp(ReversibleOp):
    """Operation creating a view; exposed as `create_view` / `replace_view`."""

    def reverse(self):
        """Return the operation dropping the same view."""
        # The call was truncated; the inverse acts on the same target.
        return DropViewOp(self.target)
@Operations.register_operation("drop_view", "invoke_for_target")
class DropViewOp(ReversibleOp):
    """Operation dropping a view; exposed as `drop_view`."""

    def reverse(self):
        """Return the operation creating the same view again."""
        # Was `self.view`, an attribute that is never assigned: the
        # constructor receives the object as `target`.
        return CreateViewOp(self.target)
@Operations.register_operation("create_sp", "invoke_for_target")
@Operations.register_operation("replace_sp", "replace")
class CreateSPOp(ReversibleOp):
    """Operation creating a stored procedure; exposed as `create_sp` / `replace_sp`."""

    def reverse(self):
        """Return the operation dropping the same stored procedure."""
        # The call was truncated; the inverse acts on the same target.
        return DropSPOp(self.target)
@Operations.register_operation("drop_sp", "invoke_for_target")
class DropSPOp(ReversibleOp):
    """Operation dropping a stored procedure; exposed as `drop_sp`."""

    def reverse(self):
        """Return the operation creating the same stored procedure again."""
        # The call was truncated; the inverse acts on the same target.
        return CreateSPOp(self.target)
# NOTE(review): the Alembic recipe registers this function with
# @Operations.implementation_for(CreateViewOp); no decorator is visible
# here -- confirm the registration exists.
def create_view(operations, operation):
    """Emit the CREATE VIEW statement for the operation's target.

    `operation.target` must expose `name` and `sqltext`.
    """
    # The format arguments were truncated, leaving an invalid expression.
    operations.execute("CREATE VIEW %s AS %s" % (
        operation.target.name,
        operation.target.sqltext,
    ))
# NOTE(review): the Alembic recipe registers this function with
# @Operations.implementation_for(DropViewOp); no decorator is visible
# here -- confirm the registration exists.
def drop_view(operations, operation):
    """Emit the DROP VIEW statement for the operation's target."""
    # The format argument was truncated, leaving an invalid expression.
    operations.execute("DROP VIEW %s" % operation.target.name)
# NOTE(review): the Alembic recipe registers this function with
# @Operations.implementation_for(CreateSPOp); no decorator is visible
# here -- confirm the registration exists.
def create_sp(operations, operation):
    """Emit the CREATE FUNCTION statement for the operation's target.

    `operation.target.sqltext` is appended right after the function name, so
    it must contain the signature as well as the body.
    """
    # The statement was built but never passed to `operations.execute`, and
    # its format arguments were truncated.
    operations.execute(
        "CREATE FUNCTION %s %s" % (
            operation.target.name,
            operation.target.sqltext,
        )
    )
# NOTE(review): the Alembic recipe registers this function with
# @Operations.implementation_for(DropSPOp); no decorator is visible
# here -- confirm the registration exists.
def drop_sp(operations, operation):
    """Emit the DROP FUNCTION statement for the operation's target."""
    # The format argument was truncated, leaving an invalid expression.
    operations.execute("DROP FUNCTION %s" % operation.target.name)
# Make this a standalone script...
# Can be called this way: python3 gargantext/tools/
import os
import django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
# ...End of gibberish.
import itertools
import colorama
from colorama import Fore
from sqlalchemy.sql.expression import literal_column
from gargantext.util.db import session, func, aliased
from gargantext.models import Node
FIRST = 0x01
LAST = 0x02
def nodes(parent=None, group_by='typename', order_by='typename', has_child='check'):
if group_by or has_child is not None:
select = [func.min('id'),
select = ['id'),'name'),
if has_child is not None:
N = aliased(Node)
parent_id = getattr(parent, 'id', parent)
q = session.query(*select).filter_by(parent_id=parent_id) \
.group_by(getattr(Node, group_by if group_by else 'id'))
if has_child is not None:
q = q.outerjoin(N, N.parent_id ==
return q.order_by(order_by)
def node_show(node, prefix='', maxlen=60):
    """Print one tree line for a row returned by `nodes()`.

    Rows that have children, or that stand for a single node, are labelled
    with their name (cut to *maxlen* characters plus '..'); other rows are
    labelled with their `cnt` value.

    Args:
        node: row exposing `children`, `cnt`, `name` and `typename`.
        prefix: text printed right before the type name.
        maxlen: maximum number of name characters shown.
    """
    if node.children > 0 or node.cnt == 1:
        name = node.name[:maxlen] + '..' if len(node.name) > maxlen else node.name
        label = Fore.CYAN + name + Fore.RESET
    else:
        # Without this branch the name label was always overwritten.
        label = Fore.MAGENTA + str(node.cnt) + Fore.RESET
    print(prefix, '%s%s %s' % (Fore.GREEN, node.typename, label), sep='')
def tree_show(node, pos=FIRST|LAST, level=0, prefix='', maxlen=60, compact=True):
    """Print *node* and, recursively, its children as a text tree.

    Args:
        node: row to display, as returned by `nodes()`.
        pos: bit mask of FIRST / LAST giving the node's position among its
            siblings.
        level: depth of the node, 0 for a root.
        prefix: text printed before the branch drawing of this node.
        maxlen: forwarded to `node_show`.
        compact: when true, children are grouped by 'typename'.
    """
    if pos & FIRST and level == 0:
        branch = TREE_ROOT
    elif not pos & LAST:
        branch = TREE_FORK
    else:
        branch = TREE_CORN

    node_prefix = prefix + branch + 2*TREE_HORI + ' '
    node_show(node, node_prefix, maxlen)

    childs = iter(nodes(parent=node, group_by=compact and 'typename'))

    try:
        node = next(childs)
    except StopIteration:
        # No child at all: nothing more to draw under this node.
        return

    prefix = prefix + (' ' if pos&LAST else TREE_VERT) + ' '

    # Look one item ahead (None marks the end) so that the last child is
    # known when it is drawn.
    for i, next_node in enumerate(itertools.chain(childs, [None])):
        pos = (FIRST if i == 0 else 0) | (LAST if next_node is None else 0)
        tree_show(node, pos, level + 1, prefix, maxlen, compact)
        node = next_node
if __name__ == "__main__":
import sys
if len(sys.argv) == 1:
compact = True
elif len(sys.argv) == 2 and sys.argv[1] in ('-a', '--all'):
compact = False
print("Usage: %s [-a|--all]" % sys.argv[0], file=sys.stderr)
for root in nodes():
tree_show(root, compact=compact)
......@@ -45,6 +45,7 @@ class HalCrawler(Crawler):
, uri_s
, isbn_s
, issue_s
, docType_s
, journalPublisher_s
#, authUrl_s
......@@ -5,16 +5,15 @@ from gargantext.util.json import json_dumps
# get engine, session, etc.
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import delete
from sqlalchemy_searchable import make_searchable
def get_engine():
from sqlalchemy import create_engine
url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(
return create_engine( url
return create_engine( settings.DATABASES['default']['URL']
, use_native_hstore = True
, json_serializer = json_dumps
, pool_size=20, max_overflow=0
......@@ -22,20 +21,16 @@ def get_engine():
engine = get_engine()
# To make Full Text search possible, uncomment lines below
Base = declarative_base()
session = scoped_session(sessionmaker(bind=engine))
# tools to build models
from sqlalchemy.types import *
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION
from sqlalchemy.ext.mutable import MutableDict, MutableList
# useful for queries
......@@ -10,7 +10,7 @@ __all__ = ['json_encoder', 'json_dumps']
class JSONEncoder(json.JSONEncoder):
def default(self, obj):
from gargantext.util.db import Base
from gargantext.models import Base
if isinstance(obj, Base):
return {
key: value
......@@ -12,15 +12,19 @@ from gargantext.constants import DEFAULT_MAPLIST_MAX,\
def do_maplist_query():
return None
def do_maplist(corpus,
overwrite_id = None,
mainlist_id = None,
specclusion_id = None,
genclusion_id = None,
grouplist_id = None,
overwrite_id = None,
mainlist_id = None,
specclusion_id = None,
genclusion_id = None,
grouplist_id = None,
According to Genericity/Specificity and mainlist
......@@ -28,9 +32,9 @@ def do_maplist(corpus,
- mainlist_id (starting point, already cleaned of stoplist terms)
- specclusion_id (ngram inclusion by cooc specificity -- ranking factor)
- genclusion_id (ngram inclusion by cooc genericity -- ranking factor)
- genclusion_id (ngram inclusion by cooc genericity -- ranking factor)
- grouplist_id (filtering grouped ones)
- overwrite_id: optional if preexisting MAPLIST node to overwrite
- overwrite_id: optional. Overwrite if preexisting MAPLIST node
+ 3 params to modulate the terms choice
- limit for the amount of picked terms
......@@ -77,6 +81,7 @@ def do_maplist(corpus,
.join(Ngram, == ScoreSpec.ngram_id)
.join(ScoreGen, ScoreGen.ngram_id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specclusion_id)
.filter(ScoreGen.node_id == genclusion_id)
......@@ -155,10 +160,10 @@ def do_maplist(corpus,
# at the end of the first loop we just need to sort all by the second ranker (gen)
scored_ngrams = sorted(scored_ngrams, key=lambda ng_infos: ng_infos[2], reverse=True)
obtained_spec_mono = len(chosen_ngrams['topspec']['monograms'])
obtained_spec_mono = len(chosen_ngrams['topspec']['monograms'])
obtained_spec_multi = len(chosen_ngrams['topspec']['multigrams'])
obtained_gen_mono = len(chosen_ngrams['topgen']['monograms'])
obtained_gen_multi = len(chosen_ngrams['topgen']['multigrams'])
obtained_gen_mono = len(chosen_ngrams['topgen']['monograms'])
obtained_gen_multi = len(chosen_ngrams['topgen']['multigrams'])
obtained_total = obtained_spec_mono \
+ obtained_spec_multi \
+ obtained_gen_mono \
......@@ -54,6 +54,7 @@ update-locale LC_ALL=fr_FR.UTF-8
cd /srv/
pip3 install virtualenv
virtualenv /srv/env_3-5 -p /usr/bin/python3.5
echo '/srv/gargantext' > /srv/env_3-5/lib/python3.5/site-packages/gargantext.pth
echo 'alias venv="source /srv/env_3-5/bin/activate"' >> ~/.bashrc
......@@ -15,9 +15,9 @@ RUN apt-get update && \
apt-utils ca-certificates locales \
sudo aptitude gcc g++ wget git vim \
build-essential make \
postgresql-9.5 postgresql-client-9.5 postgresql-contrib-9.5 \
postgresql-server-dev-9.5 libpq-dev libxml2 \
postgresql-9.5 postgresql-client-9.5 postgresql-contrib-9.5
postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6 \
postgresql-server-dev-9.6 libpq-dev libxml2 \
postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6
### Configure timezone and locale
......@@ -37,7 +37,7 @@ ENV LC_ALL fr_FR.UTF-8
### Install main dependencies and python packages based on Debian distrib
RUN echo "############# PYTHON DEPENDENCIES ###############"
RUN apt-get update && apt-get install -y \
libxml2-dev xml-core libgfortran-5-dev \
libxml2-dev xml-core libgfortran-6-dev \
libpq-dev \
python3.5 \
python3-dev \
......@@ -47,8 +47,8 @@ RUN apt-get update && apt-get install -y \
# python dependencies
python3-pip \
# for lxml
libxml2-dev libxslt-dev
#libxslt1-dev zlib1g-dev
libxml2-dev libxslt-dev \
libxslt1-dev zlib1g-dev
RUN apt-get update && apt-get autoclean &&\
......@@ -51,12 +51,12 @@ server {
# Add index.php to the list if you are using PHP
#index index.html index.htm index.nginx-debian.html;
server_name _ ;
server_name _ localhost ;
# Django media
location /media {
alias /var/www/gargantext/media; # your Django project's media files - amend as required
alias /srv/gargantext_media; # your Django project's media files - amend as required
location /static {
......@@ -72,23 +72,3 @@ server {
access_log /var/log/nginx/access.log;
error_log /var/log/nginx/error.log;
server {
listen 80 ;
listen [::]:80;
server_name ;
error_page 404 /index.html;
location / {
root /var/www/dl ;
proxy_set_header Host $host;
proxy_buffering off;
access_log /var/log/nginx/;
error_log /var/log/nginx/;
......@@ -33,3 +33,5 @@ lxml==3.5.0
# SQLAlchemy-Searchable==0.10.4
sudo docker rm $(sudo docker ps -a | grep sh | awk '{print $1}')
sudo docker build -t garg-notebook:latest ./notebook
#-v /srv/gargandata:/srv/gargandata \
#-v /srv/gargantext_lib:/srv/gargantext_lib \
sudo docker rm $(sudo docker ps -a | grep notebook | grep sh | awk '{print $1}')
sudo docker run \
--name=garg-notebook \
-v /srv/gargantext:/srv/gargantext \
-p 8899:8899 \
-it garg-notebook:latest \
/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && jupyter notebook --port=8899 --ip= --no-browser --notebook-dir=/home/notebooks/'"
#/bin/bash -c "/bin/su gargantua -c 'source /env_3-5/bin/activate && jupyter notebook --port=8899 --ip= --no-browser'"
# Gargamelle WEB
# Build an image based on the debian:stretch image,
# which contains all the source code of the app
FROM debian:stretch
USER root
### Update and install base dependencies
RUN echo "############ DEBIAN LIBS ###############"
RUN apt-get update && \
apt-get install -y \
apt-utils ca-certificates locales \
sudo aptitude gcc g++ wget git vim \
build-essential make \
curl \
postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6 \
postgresql-server-dev-9.6 libpq-dev libxml2 \
postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6
# Install Stack
### Configure timezone and locale
RUN echo "########### LOCALES & TZ #################"
RUN echo "Europe/Paris" > /etc/timezone
ENV TZ "Europe/Paris"
RUN sed -i -e 's/# en_GB.UTF-8 UTF-8/en_GB.UTF-8 UTF-8/' /etc/locale.gen && \
sed -i -e 's/# fr_FR.UTF-8 UTF-8/fr_FR.UTF-8 UTF-8/' /etc/locale.gen && \
dpkg-reconfigure --frontend=noninteractive locales && \
echo 'LANG="fr_FR.UTF-8"' > /etc/default/locale
### Install main dependencies and python packages based on Debian distrib
RUN echo "############# PYTHON DEPENDENCIES ###############"
RUN apt-get update && apt-get install -y \
libxml2-dev xml-core libgfortran-6-dev \
libpq-dev \
python3.5 \
python3-dev \
# for numpy, pandas and numpyperf \
python3-six python3-numpy python3-setuptools \
python3-numexpr \
# python dependencies \
python3-pip \
# for lxml
libxml2-dev libxslt-dev libxslt1-dev zlib1g-dev
RUN apt-get update && apt-get autoclean \
&& rm -rf /var/lib/apt/lists/*
#NB: removing /var/lib/apt/lists avoids significantly filling up the /var/ folder on your native system
RUN adduser --disabled-password --gecos "" notebooks
RUN pip3 install virtualenv
RUN virtualenv /env_3-5
RUN echo 'alias venv="source /env_3-5/bin/activate"' >> ~/.bashrc
ADD requirements.txt /
RUN . /env_3-5/bin/activate && pip3 install -r requirements.txt && \
pip3 install git+ && \
python3 -m nltk.downloader averaged_perceptron_tagger -d /usr/local/share/nltk_data
#RUN ./
#RUN ./
RUN chown notebooks:notebooks -R /env_3-5
### Notebook IHaskell and IPYTHON ENVIRONMENT
RUN apt-get update && apt-get install -y \
libtinfo-dev \
libzmq3-dev \
libcairo2-dev \
libpango1.0-dev \
libmagic-dev \
libblas-dev \
RUN curl -sSL | sh
RUN stack setup
RUN git clone
RUN . /env_3-5/bin/activate \
&& cd IHaskell \
&& stack install gtk2hs-buildtools \
&& stack install --fast \
&& /root/.local/bin/ihaskell install --stack
#RUN sed -iP "s%^data_directory.*%data_directory = \'\/srv\/gargandata\'%" /etc/postgresql/9.5/main/postgresql.conf
#RUN echo "host all all md5" >> /etc/postgresql/9.5/main/pg_hba.conf
#RUN echo "listen_addresses='*'" >> /etc/postgresql/9.5/main/postgresql.conf
VOLUME ["/srv/","/home/notebooks/"]
# __| |(_) __ _ _ __ __ _ ___
# / _` || |/ _` | '_ \ / _` |/ _ \
# | (_| || | (_| | | | | (_| | (_) |
# \__,_|/ |\__,_|_| |_|\__, |\___/
# |__/ |___/
#configure django migrations
echo "::::: DJANGO :::::"
#echo "Starting Postgres"
#/usr/sbin/service postgresql start
su gargantua -c 'source /srv/env_3-5/bin/activate &&\
echo "Activated env" &&\
/srv/gargantext/ makemigrations &&\
/srv/gargantext/ migrate && \
echo "migrations ok" &&\
/srv/gargantext/ && \
/srv/gargantext/ && \
/srv/gargantext/ && \
/srv/gargantext/ createsuperuser'
service postgresql stop
Gargantext Software Copyright (c) 2016 CNRS ISC-PIF -
Licence (see : )
- In France : a CECILL variant affero compliant
- GNU aGPLV3 for all other countries
#!/usr/bin/env python
import sys
import os
# Django settings
dirname = os.path.dirname(os.path.realpath(__file__))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext.settings")
# initialize Django application
from django.core.wsgi import get_wsgi_application
application = get_wsgi_application()
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from gargantext.util.db import *
from gargantext.models import Node
from nltk.tokenize import wordpunct_tokenize
from gargantext.models import *
from nltk.tokenize import word_tokenize
import nltk as nltk
from statistics import mean
from math import log
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import datetime
from collections import Counter
from langdetect import detect as detect_lang
def documents(corpus_id):
    """Return the DOCUMENT nodes whose parent is the given corpus.

    NOTE(review): the end of the original expression was missing; it is
    closed here with `.all()` so that a list is returned -- confirm against
    the repository.
    """
    query = session.query(Node).filter(
        Node.parent_id == corpus_id,
        Node.typename == "DOCUMENT",
    )
    # Ordering by publication date was disabled in the original:
    # .order_by(Node.hyperdata['publication_date'])
    return query.all()
import seaborn as sns
import pandas as pd
def chart(docs, field):
    """Count the documents per distinct value of a hyperdata field.

    Args:
        docs: iterable of objects exposing a `hyperdata` mapping.
        field: key read from every document's `hyperdata`.

    Returns:
        A DataFrame with a 'Date' column (the distinct values) and a
        'DateValue' column (their number of occurrences), indexed by the
        'Date' values.
    """
    columns = ['Date', 'DateValue']
    occurrences = Counter([doc.hyperdata[field] for doc in docs])
    rows = list(occurrences.items())
    # A first frame is built only to obtain the 'Date' series used as index.
    dates = pd.DataFrame(rows, columns=columns).Date
    return pd.DataFrame(rows, columns=columns, index=dates)
from gargantext.util.crawlers.HAL import HalCrawler
def scan_hal(request):
    """Return what a fresh HalCrawler's `scan_results` gives for *request*."""
    return HalCrawler().scan_results(request)
def scan_gargantext(corpus_id, lang, request):
    """Count the nodes of a corpus matching a full-text search.

    Args:
        corpus_id: id of the parent node whose children are searched.
        lang: text search configuration given to `to_tsvector`.
        request: query string given to `to_tsquery`.

    Returns:
        The count returned by the database.
    """
    # TODO: add some sugar to the request (ideally the request syntax should
    # be the same for HAL and Gargantext).
    # Values are passed as DBAPI parameters instead of being interpolated into
    # the SQL text, so that quotes in `request` can neither break nor alter
    # the statement.
    # NOTE(review): `hyperdata ->> 'abstract' || 'title'` appends the literal
    # string 'title' to the abstract; the title field was probably intended
    # (hyperdata ->> 'title') -- left unchanged, confirm.
    query = """select count(n.id) from nodes n
    where to_tsvector(%s::regconfig, hyperdata ->> 'abstract' || 'title')
    @@ to_tsquery(%s)
    AND n.parent_id = %s;"""
    connection = get_engine().connect()
    try:
        return connection.execute(query, (lang, request, corpus_id)).fetchone()[0]
    finally:
        # The connection used to be left open after every call.
        connection.close()
## ____ _
## | _ \ ___ ___| |_ __ _ _ __ ___ ___
## | |_) / _ \/ __| __/ _` | '__/ _ \/ __|
## | __/ (_) \__ \ || (_| | | | __/\__ \
## |_| \___/|___/\__\__, |_| \___||___/
## |___/
echo "::::: POSTGRESQL :::::"
su postgres -c 'pg_dropcluster 9.4 main --stop'
#done in docker but redoing it
rm -rf /srv/gargandata && mkdir /srv/gargandata && chown postgres:postgres /srv/gargandata
su postgres -c '/usr/lib/postgresql/9.6/bin/initdb -D /srv/gargandata/'
su postgres -c '/usr/lib/postgresql/9.6/bin/pg_ctl -D /srv/gargandata/ -l /srv/gargandata/journal_applicatif start'
su postgres -c 'pg_createcluster -D /srv/gargandata 9.6 main '
su postgres -c 'pg_ctlcluster -D /srv/gargandata 9.6 main start '
su postgres -c 'pg_ctlcluster 9.6 main start'
service postgresql start
su postgres -c "psql -c \"CREATE user gargantua WITH PASSWORD 'C8kdcUrAQy66U'\""
su postgres -c "createdb -O gargantua gargandb"
echo "Postgres configured"
#service postgresql stop
# try bottleneck
kombu==3.0.37 # messaging
langdetect==1.0.6 # detecting language
pytz==2016.10 # timezones
umalqurra==0.2 # arabic calendars (?? why use ??)
......@@ -367,7 +367,7 @@
<span class="glyphicon glyphicon-registration-mark" aria-hidden="true"></span>
, version,
, version 3.0.7,
<a href="" target="blank" title="Institution that enables this project.">
<span class="glyphicon glyphicon-copyright-mark" aria-hidden="true"></span>
......@@ -79,42 +79,39 @@
<div class="container">
<div id="status" class="row col-lg-12 collapse">
<div class="container">
<div id="status" class="row col-lg-12 collapse">
<div id="status-msg" class="alert">
<div class="row collapse" id="editor">
<button title="delete selected project" type="button" class="btn btn-danger" id="delete">
<span class="glyphicon glyphicon-trash " aria-hidden="true" ></span>
<button title="edit selected project" type="button" class="btn btn-warning" id="edit">
<span class="glyphicon glyphicon-pencil " aria-hidden="true" onclick="editProjects()"></span>
</button> -->
<!-- <button type="button" class="btn btn-info" id="recalculate">
<span class="glyphicon glyphicon-refresh " aria-hidden="true" onclick="recalculateProjects()"></span>
<div id="status-msg" class="alert">
<br />
<div class="row collapse" id="editor">
<button title="delete selected project" type="button" class="btn btn-danger" id="delete">
<span class="glyphicon glyphicon-trash " aria-hidden="true" ></span>
<button title="edit selected project" type="button" class="btn btn-warning" id="edit">
<span class="glyphicon glyphicon-pencil " aria-hidden="true" onclick="editProjects()"></span>
</button> -->
<!-- <button type="button" class="btn btn-info" id="recalculate">
<span class="glyphicon glyphicon-refresh " aria-hidden="true" onclick="recalculateProjects()"></span>
<br />
<div class="row container" id="projects">
<!--here loading projectlist from GET /projects-->
<img id="wait-img" width="90%" style="display:none" src="{% static "img/ajax-loader.gif"%}"></img>
<div class="row container" id="projects">
<!--here loading projectlist from GET /projects-->
<img id="wait-img" width="90%" style="display:none" src="{% static "img/ajax-loader.gif"%}"></img>
<script type="html/tpl" id="project_item">
<div id="{url}" class="item row">
......@@ -26,8 +26,7 @@ environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext.settings")
DATABASES['default']['NAME'] = DATABASES['default']['TEST']['NAME']
setup() # models can now be imported
from gargantext import models # Base is now filled
from gargantext.util.db import Base # contains metadata.tables
from gargantext.models import Base # contains metadata.tables
# ------------------------------------------------------------------------------
# thanks to our hack, util.db.engine and util.db.session already use the test DB
