Commit c2992964 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] fields in hal.

parents f2be4ea7 7fc480ba
......@@ -2,6 +2,8 @@
* Guided Tour
* Sources form highlighting crawlers
## Version 3.0.7
* Alembic implemented to manage database migrations
## Version 3.0.6.8
* REPEC Crawler (connection with https://multivac.iscpif.fr)
......
# A generic, single database configuration.
[alembic]
# path to migration scripts
script_location = alembic
# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s
# timezone to use when rendering the date
# within the migration file as well as the filename.
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =
# max length of characters to apply to the
# "slug" field
#truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; this defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path
# version_locations = %(here)s/bar %(here)s/bat alembic/versions
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
# XXX For database access configuration, see alembic/env.py
#sqlalchemy.url = driver://user:pass@localhost/dbname
[alembic:exclude]
tables = django_* celery_* djcelery_* auth_*
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
Alembic must be installed in the virtualenv in order to use the right Python paths,
so it is installed with pip. The commands described in this short documentation
must be executed from the gargantext root directory, i.e. /srv/gargantext.
Keep in mind that Alembic only handles SQLAlchemy models: tables created by the
Django ORM must be kept out of Alembic's sight. See the [alembic:exclude] section in
alembic.ini.
TELL ALEMBIC TO NOT START FROM SCRATCH
# To upgrade a database populated before Alembic was used in Gargantext,
# don't forget to tell Alembic your current version before running the
# "upgrade head" command. If you don't want to do this, you can of course
# drop your database and really start from scratch.
alembic stamp 601e9d9baa4c
UPGRADE TO LATEST DATABASE VERSION
alembic upgrade head
DOWNGRADE TO INITIAL DATABASE STATE
# /!\ RUNNING THIS COMMAND WILL CAUSE ALL DATA TO BE LOST WITHOUT ASKING !!
alembic downgrade base
GENERATE A NEW REVISION
alembic revision -m "Message for this migration"
# A migration script is then created in alembic/versions directory. For
# example alembic/versions/3adcc9a56557_message_for_this_migration.py
# where 3adcc9a56557 is the revision id generated by Alembic.
#
# This script must be edited to write the migration itself, mainly
# in `upgrade` and `downgrade` functions. See Alembic documentation for
# further details.
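# For instance, a minimal hand-written migration body might look like
# this (the `comment` column is purely illustrative, not part of the
# actual schema):
#
#     def upgrade():
#         op.add_column('nodes', sa.Column('comment', sa.Text(), nullable=True))
#
#     def downgrade():
#         op.drop_column('nodes', 'comment')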
GENERATE A REVISION FROM CURRENT STATE
alembic revision --autogenerate -m "Message for this migration"
# Alembic should generate a script reflecting changes already made in
# database. However it is always a good idea to check it and edit it
# manually, Alembic is not always accurate and can't see all alterations.
# It should work with basic changes such as model or column creation. See
# http://alembic.zzzcomputing.com/en/latest/autogenerate.html#what-does-autogenerate-detect-and-what-does-it-not-detect
from __future__ import with_statement
from alembic import context
from sqlalchemy import engine_from_config, pool
from logging.config import fileConfig
import re
# Add project root directory to the path and setup Django...
import os
import django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
# ...to be able to import gargantext.
from gargantext import settings, models
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
config.set_main_option("sqlalchemy.url", settings.DATABASES['default']['URL'])
# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = models.Base.metadata
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
# Inspired from https://gist.github.com/utek/6163250
def exclude_tables_from_config(config):
    # Build a regex from the space-separated globs of [alembic:exclude],
    # e.g. "django_* celery_*" becomes the pattern "django_.*|celery_.*"
    tables = config.get("tables", '').replace('*', '.*').split(' ')
    pattern = '|'.join(tables)
    return re.compile(pattern)
exclude_tables = exclude_tables_from_config(config.get_section('alembic:exclude'))
def include_object(obj, name, typ, reflected, compare_to):
if typ == "table" and exclude_tables.match(name):
return False
else:
return True
def run_migrations_offline():
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url, target_metadata=target_metadata, literal_binds=True,
include_object=include_object)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online():
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
connectable = engine_from_config(
config.get_section(config.config_ini_section),
prefix='sqlalchemy.',
poolclass=pool.NullPool)
with connectable.connect() as connection:
context.configure(
connection=connection,
target_metadata=target_metadata,
include_object=include_object
)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
import gargantext
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade():
${upgrades if upgrades else "pass"}
def downgrade():
${downgrades if downgrades else "pass"}
"""Put a timezone on Node.date
Revision ID: 08230100f262
Revises: 601e9d9baa4c
Create Date: 2017-07-06 13:47:10.788569
"""
from alembic import op
import sqlalchemy as sa
import gargantext
# revision identifiers, used by Alembic.
revision = '08230100f262'
down_revision = '601e9d9baa4c'
branch_labels = None
depends_on = None
def upgrade():
op.alter_column('nodes', 'date', type_=sa.DateTime(timezone=True))
def downgrade():
op.alter_column('nodes', 'date', type_=sa.DateTime(timezone=False))
"""Add OCC_HIST & OCC_HIST_PART functions
Revision ID: 601e9d9baa4c
Revises: 932dbf3e8c43
Create Date: 2017-07-06 10:52:16.161118
"""
from alembic import op
import sqlalchemy as sa
from gargantext.tools.alembic import ReplaceableObject
# revision identifiers, used by Alembic.
revision = '601e9d9baa4c'
down_revision = '932dbf3e8c43'
branch_labels = None
depends_on = None
# -- OCC_HIST_PART :: Corpus.id -> GroupList.id -> Start -> End
occ_hist_part = ReplaceableObject(
"OCC_HIST_PART(int, int, timestamp, timestamp)",
"""
RETURNS TABLE (ng_id int, score float8)
AS $$
-- EXPLAIN ANALYZE
SELECT
COALESCE(gr.ngram1_id, ng1.ngram_id) as ng_id,
SUM(ng1.weight) as score
from nodes n
-- BEFORE
INNER JOIN nodes as n1 ON n1.id = n.id
INNER JOIN nodes_ngrams ng1 ON ng1.node_id = n1.id
-- Limit with timestamps: ]start, end]
INNER JOIN nodes_hyperdata nh1 ON nh1.node_id = n1.id
AND nh1.value_utc > $3
AND nh1.value_utc <= $4
-- Group List
LEFT JOIN nodes_ngrams_ngrams gr ON ng1.ngram_id = gr.ngram2_id
AND gr.node_id = $2
WHERE
n.typename = 4
AND n.parent_id = $1
GROUP BY 1
$$
LANGUAGE SQL;
"""
)
# -- OCC_HIST :: Corpus.id -> GroupList.id -> MapList.id -> Start -> EndFirst -> EndLast
# -- EXAMPLE USAGE
# -- SELECT * FROM OCC_HIST(182856, 183859, 183866, '1800-03-15 17:00:00+01', '2000-03-15 17:00:00+01', '2017-03-15 17:00:00+01')
occ_hist = ReplaceableObject(
"OCC_HIST(int, int, int, timestamp, timestamp, timestamp)",
"""
RETURNS TABLE (ng_id int, score numeric)
AS $$
WITH OCC1 as (SELECT * from OCC_HIST_PART($1, $2, $4, $5))
, OCC2 as (SELECT * from OCC_HIST_PART($1, $2, $5, $6))
, GROWTH as (SELECT ml.ngram_id as ngram_id
, COALESCE(OCC1.score, null) as score1
, COALESCE(OCC2.score, null) as score2
FROM nodes_ngrams ml
LEFT JOIN OCC1 ON OCC1.ng_id = ml.ngram_id
LEFT JOIN OCC2 ON OCC2.ng_id = ml.ngram_id
WHERE ml.node_id = $3
ORDER by score2 DESC)
SELECT ngram_id, COALESCE(ROUND(CAST((100 * (score2 - score1) / COALESCE((score2 + score1), 1)) as numeric), 2), 0) from GROWTH
$$
LANGUAGE SQL;
"""
)
# -- BEHAVIORAL TEST (should be equal to occ in terms table)
# -- WITH OCC as (SELECT * from OCC_HIST(182856, 183859, '1800-03-15 17:00:00+01', '2300-03-15 17:00:00+01'))
# -- SELECT ng_id, score from OCC
# -- INNER JOIN nodes_ngrams ml on ml.ngram_id = ng_id
# -- AND ml.node_id = 183866
# -- ORDER BY score DESC;
def upgrade():
op.create_sp(occ_hist_part)
op.create_sp(occ_hist)
def downgrade():
op.drop_sp(occ_hist)
op.drop_sp(occ_hist_part)
"""Initial migration
Revision ID: 932dbf3e8c43
Revises:
Create Date: 2017-07-05 16:41:23.951422
"""
from alembic import op
import sqlalchemy as sa
import gargantext
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '932dbf3e8c43'
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('contacts',
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('user1_id', sa.Integer(), nullable=True),
sa.Column('user2_id', sa.Integer(), nullable=True),
sa.Column('is_blocked', sa.Boolean(), nullable=True),
sa.Column('date_creation', sa.DateTime(), nullable=True),
sa.ForeignKeyConstraint(['user1_id'], ['auth_user.id'], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['user2_id'], ['auth_user.id'], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('id'),
sa.UniqueConstraint('user1_id', 'user2_id')
)
op.create_table('ngrams',
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('terms', sa.String(length=255), nullable=True),
sa.Column('n', sa.Integer(), nullable=True),
sa.PrimaryKeyConstraint('id'),
sa.UniqueConstraint('terms')
)
op.create_index('ngrams_id_n_idx', 'ngrams', ['id', 'n'], unique=False)
op.create_index('ngrams_n_idx', 'ngrams', ['n'], unique=False)
op.create_table('nodes',
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('typename', gargantext.models.nodes.NodeType(), nullable=True),
sa.Column('user_id', sa.Integer(), nullable=True),
sa.Column('parent_id', sa.Integer(), nullable=True),
sa.Column('name', sa.String(length=255), nullable=True),
sa.Column('date', sa.DateTime(), nullable=True),
sa.Column('hyperdata', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
sa.ForeignKeyConstraint(['parent_id'], ['nodes.id'], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['user_id'], ['auth_user.id'], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('id')
)
op.create_index(op.f('ix_nodes_typename'), 'nodes', ['typename'], unique=False)
op.create_index('nodes_hyperdata_idx', 'nodes', ['hyperdata'], unique=False)
op.create_index('nodes_user_id_typename_parent_id_idx', 'nodes', ['user_id', 'typename', 'parent_id'], unique=False)
op.create_table('nodes_hyperdata',
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('node_id', sa.Integer(), nullable=True),
sa.Column('key', gargantext.models.hyperdata.HyperdataKey(), nullable=True),
sa.Column('value_int', sa.Integer(), nullable=True),
sa.Column('value_flt', postgresql.DOUBLE_PRECISION(), nullable=True),
sa.Column('value_utc', sa.DateTime(timezone=True), nullable=True),
sa.Column('value_str', sa.String(length=255), nullable=True),
sa.Column('value_txt', sa.Text(), nullable=True),
sa.ForeignKeyConstraint(['node_id'], ['nodes.id'], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('id')
)
op.create_index(op.f('ix_nodes_hyperdata_value_flt'), 'nodes_hyperdata', ['value_flt'], unique=False)
op.create_index(op.f('ix_nodes_hyperdata_value_int'), 'nodes_hyperdata', ['value_int'], unique=False)
op.create_index(op.f('ix_nodes_hyperdata_value_str'), 'nodes_hyperdata', ['value_str'], unique=False)
op.create_index(op.f('ix_nodes_hyperdata_value_utc'), 'nodes_hyperdata', ['value_utc'], unique=False)
op.create_index('nodes_hyperdata_node_id_key_idx', 'nodes_hyperdata', ['node_id', 'key'], unique=False)
op.create_index('nodes_hyperdata_node_id_key_value_flt_idx', 'nodes_hyperdata', ['node_id', 'key', 'value_flt'], unique=False)
op.create_index('nodes_hyperdata_node_id_key_value_int_idx', 'nodes_hyperdata', ['node_id', 'key', 'value_int'], unique=False)
op.create_index('nodes_hyperdata_node_id_key_value_str_idx', 'nodes_hyperdata', ['node_id', 'key', 'value_str'], unique=False)
op.create_index('nodes_hyperdata_node_id_key_value_utc_idx', 'nodes_hyperdata', ['node_id', 'key', 'value_utc'], unique=False)
op.create_index('nodes_hyperdata_node_id_value_utc_idx', 'nodes_hyperdata', ['node_id', 'value_utc'], unique=False)
op.create_table('nodes_ngrams',
sa.Column('node_id', sa.Integer(), nullable=False),
sa.Column('ngram_id', sa.Integer(), nullable=False),
sa.Column('weight', sa.Float(), nullable=True),
sa.ForeignKeyConstraint(['ngram_id'], ['ngrams.id'], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['node_id'], ['nodes.id'], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('node_id', 'ngram_id')
)
op.create_index('nodes_ngrams_ngram_id_idx', 'nodes_ngrams', ['ngram_id'], unique=False)
op.create_index('nodes_ngrams_node_id_idx', 'nodes_ngrams', ['node_id'], unique=False)
op.create_index('nodes_ngrams_node_id_ngram_id_idx', 'nodes_ngrams', ['node_id', 'ngram_id'], unique=False)
op.create_table('nodes_ngrams_ngrams',
sa.Column('node_id', sa.Integer(), nullable=False),
sa.Column('ngram1_id', sa.Integer(), nullable=False),
sa.Column('ngram2_id', sa.Integer(), nullable=False),
sa.Column('weight', sa.Float(precision=24), nullable=True),
sa.ForeignKeyConstraint(['ngram1_id'], ['ngrams.id'], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['ngram2_id'], ['ngrams.id'], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['node_id'], ['nodes.id'], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('node_id', 'ngram1_id', 'ngram2_id')
)
op.create_index('nodes_ngrams_ngrams_ngram1_id_idx', 'nodes_ngrams_ngrams', ['ngram1_id'], unique=False)
op.create_index('nodes_ngrams_ngrams_ngram2_id_idx', 'nodes_ngrams_ngrams', ['ngram2_id'], unique=False)
op.create_index('nodes_ngrams_ngrams_node_id_idx', 'nodes_ngrams_ngrams', ['node_id'], unique=False)
op.create_index('nodes_ngrams_ngrams_node_id_ngram1_id_ngram2_id_idx', 'nodes_ngrams_ngrams', ['node_id', 'ngram1_id', 'ngram2_id'], unique=False)
op.create_table('nodes_nodes',
sa.Column('node1_id', sa.Integer(), nullable=False),
sa.Column('node2_id', sa.Integer(), nullable=False),
sa.Column('score', sa.Float(precision=24), nullable=True),
sa.ForeignKeyConstraint(['node1_id'], ['nodes.id'], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['node2_id'], ['nodes.id'], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('node1_id', 'node2_id')
)
op.create_index('nodes_nodes_node1_id_node2_id_idx', 'nodes_nodes', ['node1_id', 'node2_id'], unique=False)
op.create_table('nodes_nodes_ngrams',
sa.Column('node1_id', sa.Integer(), nullable=False),
sa.Column('node2_id', sa.Integer(), nullable=False),
sa.Column('ngram_id', sa.Integer(), nullable=False),
sa.Column('score', sa.Float(precision=24), nullable=True),
sa.ForeignKeyConstraint(['ngram_id'], ['ngrams.id'], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['node1_id'], ['nodes.id'], ondelete='CASCADE'),
sa.ForeignKeyConstraint(['node2_id'], ['nodes.id'], ondelete='CASCADE'),
sa.PrimaryKeyConstraint('node1_id', 'node2_id', 'ngram_id')
)
op.create_index('nodes_nodes_ngrams_node1_id_idx', 'nodes_nodes_ngrams', ['node1_id'], unique=False)
op.create_index('nodes_nodes_ngrams_node2_id_idx', 'nodes_nodes_ngrams', ['node2_id'], unique=False)
# ### end Alembic commands ###
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.drop_index('nodes_nodes_ngrams_node2_id_idx', table_name='nodes_nodes_ngrams')
op.drop_index('nodes_nodes_ngrams_node1_id_idx', table_name='nodes_nodes_ngrams')
op.drop_table('nodes_nodes_ngrams')
op.drop_index('nodes_nodes_node1_id_node2_id_idx', table_name='nodes_nodes')
op.drop_table('nodes_nodes')
op.drop_index('nodes_ngrams_ngrams_node_id_ngram1_id_ngram2_id_idx', table_name='nodes_ngrams_ngrams')
op.drop_index('nodes_ngrams_ngrams_node_id_idx', table_name='nodes_ngrams_ngrams')
op.drop_index('nodes_ngrams_ngrams_ngram2_id_idx', table_name='nodes_ngrams_ngrams')
op.drop_index('nodes_ngrams_ngrams_ngram1_id_idx', table_name='nodes_ngrams_ngrams')
op.drop_table('nodes_ngrams_ngrams')
op.drop_index('nodes_ngrams_node_id_ngram_id_idx', table_name='nodes_ngrams')
op.drop_index('nodes_ngrams_node_id_idx', table_name='nodes_ngrams')
op.drop_index('nodes_ngrams_ngram_id_idx', table_name='nodes_ngrams')
op.drop_table('nodes_ngrams')
op.drop_index('nodes_hyperdata_node_id_value_utc_idx', table_name='nodes_hyperdata')
op.drop_index('nodes_hyperdata_node_id_key_value_utc_idx', table_name='nodes_hyperdata')
op.drop_index('nodes_hyperdata_node_id_key_value_str_idx', table_name='nodes_hyperdata')
op.drop_index('nodes_hyperdata_node_id_key_value_int_idx', table_name='nodes_hyperdata')
op.drop_index('nodes_hyperdata_node_id_key_value_flt_idx', table_name='nodes_hyperdata')
op.drop_index('nodes_hyperdata_node_id_key_idx', table_name='nodes_hyperdata')
op.drop_index(op.f('ix_nodes_hyperdata_value_utc'), table_name='nodes_hyperdata')
op.drop_index(op.f('ix_nodes_hyperdata_value_str'), table_name='nodes_hyperdata')
op.drop_index(op.f('ix_nodes_hyperdata_value_int'), table_name='nodes_hyperdata')
op.drop_index(op.f('ix_nodes_hyperdata_value_flt'), table_name='nodes_hyperdata')
op.drop_table('nodes_hyperdata')
op.drop_index('nodes_user_id_typename_parent_id_idx', table_name='nodes')
op.drop_index('nodes_hyperdata_idx', table_name='nodes')
op.drop_index(op.f('ix_nodes_typename'), table_name='nodes')
op.drop_table('nodes')
op.drop_index('ngrams_n_idx', table_name='ngrams')
op.drop_index('ngrams_id_n_idx', table_name='ngrams')
op.drop_table('ngrams')
op.drop_table('contacts')
# ### end Alembic commands ###
#!/usr/bin/env python
import sys
import os
if __name__ == "__main__":
# Django settings
dirname = os.path.dirname(os.path.realpath(__file__))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext.settings")
# initialize Django application
from django.core.wsgi import get_wsgi_application
application = get_wsgi_application()
# retrieve Django models
import django.apps
django_models = django.apps.apps.get_models()
django_models_names = set(model._meta.db_table for model in django_models)
# migrate SQLAlchemy models
from gargantext import models
from gargantext.util.db import Base, engine
sqla_models_names = (
model for model in Base.metadata.tables.keys()
if model not in django_models_names
)
sqla_models = (
Base.metadata.tables[model_name]
for model_name in sqla_models_names
)
print()
for model in sqla_models:
try:
model.create(engine)
print('created model: `%s`' % model)
except Exception as e:
print('could not create model: `%s`, %s' % (model, e))
print()
# Definitions and notation for the documentation (!= python notation)
## Node
The table (nodes) is a list of nodes: `[Node]`
Each Node has:
- a typename
- a parent_id
- a name
### Each Node has a parent_id
Node A
├── Node B
└── Node C
If Node A is the parent of Node B and Node C,
then NodeA.id == NodeB.parent_id == NodeC.parent_id.
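Given this parent relation, fetching a node's children is a simple filter on `parent_id`. A minimal sketch with SQLAlchemy (assuming the `Node` model and a `session` as provided by `gargantext.util.db`):

```python
from gargantext.models import Node

def children_of(session, parent_node, typename='CORPUS'):
    # all children of `parent_node` having the given typename
    return (session.query(Node)
                   .filter(Node.parent_id == parent_node.id,
                           Node.typename == typename)
                   .all())
```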
### Each Node has a typename
Notation: `Node["FOO"]("bar")` is a Node with typename "FOO" and name "bar".
Then:
- Node[PROJECT] is a project.
- Node[CORPUS] is a corpus.
- Node[DOCUMENT] is a document.
The Node syntax used here does not exactly follow the Python code
(for clarity and to begin with): in Python code, typenames are
represented as UPPERCASE strings (e.g. "PROJECT").
### Each Node has a typename and a parent
Node[USER](name)
├── Node[PROJECT](myProject1)
│   ├── Node[CORPUS](myCorpus1)
│   ├── Node[CORPUS](myCorpus2)
│   └── Node[CORPUS](myCorpus3)
└── Node[PROJECT](myProject2)
/!\\ 3 ways to manage the rights of a Node:
1. Node[USER] is a folder containing all of the user's projects, corpora and
documents (i.e. Node[USER] is the parent_id of the children).
2. Each node has a user_id (mainly used today).
3. Rights management through groups (already implemented but not
used, since it is not connected to the frontend).
## Global Parameters
The global user is Gargantua (a Node with typename USER).
This node is the parent of the other nodes holding parameters.
Node[USER](gargantua) (gargantua.id == Node[USER].user_id)
├── Node[TFIDF-Global](global) : without group
│   ├── Node[TFIDF](database1)
│   ├── Node[TFIDF](database2)
│   └── Node[TFIDF](database3)
└── Node[ANOTHERMETRIC](global)
[//]: # (Are there any plans to add user wide or project wide parameters or metrics? For example TFIDF nodes related to a normal user -- ie. not Gargantua?)
Yes, we may in the future (but we have other priorities first).
[//]: # (What is the purpose of the 3 child nodes of Node[TFIDF-Global]? Are they TFIDF metrics related to databases 1, 2 and 3? If so, shouldn't they be children of related CORPUS nodes?)
Node placement in the tree indicates the context of the metric: the
metrics Node has the corpus Node as its parent, to indicate the context of the
metrics.
Answer:
Node[USER](foo)
Node[USER](bar)
├── Node[PROJECT](project1)
│   ├── Node[CORPUS](corpus1)
│   │   ├── Node[DOCUMENT](doc1)
│   │   ├── Node[DOCUMENT](doc2)
│   │   └── Node[TFIDF-global](name of the metrics)
│   ├── Node[CORPUS](corpus2)
│   └── Node[CORPUS](corpus3)
└── Node[PROJECT](project2)
## NodeNgram
NodeNgram is a relation between a Node and an ngram:
- documents and ngrams
- metrics and ngrams (the position of the metrics node indicates the
context)
# Community Parameters
# User Parameters
......@@ -8,6 +8,9 @@ Gargantext is a web plateform to explore your corpora using text-mining[...](abo
* [Take a tour](demo.md) of the different features offered by Gargantext
## Architecture
* [Architecture](architecture.md) Architecture of Gargantext
## Need some help?
Ask the community at:
......
......@@ -49,11 +49,23 @@ by cloning the repository of gargantext
``` bash
git clone ssh://gitolite@delanoe.org:1979/gargantext /srv/gargantext \
&& cd /srv/gargantext \
&& git fetch origin refactoring \
&& git checkout refactoring \
&& git fetch origin stable \
&& git checkout stable \
```
TODO(soon): git clone https://gogs.iscpif.fr/gargantext.git
* Install and configure the virtual environment
``` bash
cd /srv/
pip3 install virtualenv
virtualenv /srv/env_3-5 -p /usr/bin/python3.5
pip install -r /srv/gargantext/install
echo '/srv/gargantext' > /srv/env_3-5/lib/python3.5/site-packages/gargantext.pth
echo 'alias venv="source /srv/env_3-5/bin/activate"' >> ~/.bashrc
```
See the [next steps of installation procedure](install.md#Install)
See the [next manual steps of installation procedure](Debian.sh)
# Gargantext foundations
Collaborative platform for multi-scale text experiments
Embrace the past, update the present, forecast the future.
# Main Types of Entity definitions
Documentation valid for 3.0.\* versions of Gargantext.
## Nature of the entities
In object-oriented programming languages, they are objects.
In purely functional languages, they are types.
## Project
A project is a list of corpora (a project may have duplicate corpora).
## Corpus
A corpus is a set of documents: duplicate documents are allowed but
not recommended by the methodology, since they introduce artificially repeated content in the corpus.
In the document view, users may therefore delete duplicates with a dedicated
function.
## Document
A document is the main Entity of Textual Context (ETC); it is composed of:
- a title (truncated field name in the database)
- the date of publication
- a journal (or source)
- an abstract
- the authors
Users may add many fields to the document.
The main fields mentioned above are used for the main statistics in Gargantext.
### Source Type
Source Type is the source (database) from where documents have been
extracted.
In 3.0.\* versions of Gargantext, each corpus has only one source type
(i.e. database). But users can build their own corpus from the CSV format.
## Ngrams
### Definitions
### Gram
A gram is a contiguous sequence of letters delimited by spaces.
### N-gram
An n-gram is a contiguous sequence of n grams separated by spaces (where n
is a positive natural number).
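For example, in the API sample further down in this document, `{"id":1970,"n":1,"terms":"storms"}` is a 1-gram and `{"id":1981,"n":2,"terms":"wind effects"}` is a 2-gram.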
## N-gram Lists
## Main ngram lists: Stop/Map/Main
The recipe of Gargantext consists in offering the right ngrams for the map,
at the best level of complexity, in order to unveil its richness
according to these 2 main rules:
If ngrams are too specific, then the graph becomes too sparse.
If ngrams are too generic, then the graph becomes too connected.
As a consequence, finding the right balance between specific and generic
ngrams is the main target.
In the first versions of Gargantext, this balance is found with linear
methods. After 3.1.\*, non-linear methods trained on users' datasets
enable the system to find a better balance at any scale.
### Definition
3 main kinds of lists:
1. The Stop List contains blacklisted ngrams, i.e. the noise, or in other words the ngrams users do not want to deal with.
2. The Map List contains the ngrams that will be shown in the map.
3. The Main List, or Candidate List, contains all the other ngrams that are neither in the stop list nor in the map list. They _could_ thus end up in the map, according to the user's choice or, by default, the default parameters of Gargantext.
### Storage
The relation between a list and an ngram is stored as a Node-Ngram
relation, where:
- the Node has typename (STOP|MAIN|MAP) and, as parent_id, the context
(CORPUS in version 3.0.*, but it could be PROJECT)
- the Ngrams depend on the context of the List Node: those for which a NodeNgram
relation is not null and whose Node has typename DOCUMENT.
Node[USER](name1)
├── Node[PROJECT](project1)
│   ├── Node[CORPUS](corpus1)
│   │   ├── Node[MAPLIST](list name)
│   │   ├── Node[STOPLIST](list name)
│   │   ├── Node[MAINLIST](list name)
│   │   │
│   │   ├── Node[DOCUMENT](doc1)
│   │   ├── Node[DOCUMENT](doc2)
│   │   └── Node[DOCUMENT](doc3)
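A minimal sketch of reading such a list with SQLAlchemy (assuming the `Node`, `Ngram` and `NodeNgram` models and a `session` from `gargantext.util.db`):

```python
from gargantext.models import Node, Ngram, NodeNgram

def list_terms(session, corpus, list_typename='MAPLIST'):
    # terms of the STOP/MAIN/MAP list attached to `corpus`
    return (session.query(Ngram.terms)
                   .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                   .join(Node, Node.id == NodeNgram.node_id)
                   .filter(Node.typename == list_typename,
                           Node.parent_id == corpus.id)
                   .all())
```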
### Policy
#### Algo
Let there be a set of ngrams for which NodeNgram != 0; then
find 2 subsets of these ngrams that show a split:
- stop ngrams
- not-stop ngrams
Then, for the subset "not-stop ngrams",
find 2 subsets of ngrams that show a split:
- map ngrams
- other ngrams
#### Techno algo
A classifier (Support Vector Machine) is used on the following scaled measures
for each step:
- n (of the "n" gram)
- Occurrences: Zipf's law (in fact already used in TFICF; these
features are correlated, put here for pedagogical purposes)
- TFICF-CORPUS-SOURCETYPE
- TFICF-SOURCETYPE-ALL
- Genericity score
- Specificity score
## Metrics
### Term Frequency - Inverse Context Frequency (TF-ICF)
TFICF, short for term frequency-inverse context frequency, is a numerical
statistic that is intended to reflect how important an ngram is to a
context of text.
TFICF(ngram,contextLocal,contextGlobal) = TF(ngram,contextLocal) \* ICF(ngram, contextGlobal)
where
* TF(ngram, contextLocal) is the ngram frequency (occurrences) in contextLocal.
* ICF(ngram, contextGlobal) is the inverse (log) document frequency (occurrences) in contextGlobal.
Other types of TFICF:
- TFICF(ngram, DOCUMENT, CORPUS)
- TFICF(ngram, CORPUS, PROJECT)
- TFICF(ngram, PROJECT, DATABASETYPE)
- TFICF(ngram, DATABASETYPE, ALL)
If the context is a document in a set of documents (a corpus), then it is the usual TFIDF.
Then TFICF-DOCUMENT-CORPUS == TFICF(ngram,DOCUMENT,CORPUS) = TFIDF.
TFICF is the generalization of [TFIDF, Term Frequency - Inverse Document Frequency](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).
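As a rough numeric illustration of the formula (a sketch, not the actual Gargantext implementation):

```python
from math import log

def tficf(tf_local, contexts_with_ngram, total_contexts):
    # TF in the local context times the (log) inverse frequency of the
    # ngram among the global contexts
    return tf_local * log(total_contexts / contexts_with_ngram)

# an ngram occurring 5 times locally and present in 10 of 1000 global
# contexts: tficf(5, 10, 1000) == 5 * log(100)
score = tficf(5, 10, 1000)
```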
#### Implementation
TFICF = TF * log(ICF)
To prepare the groups, we need to store TF and ICF separately (in
NodeNgram via 2 nodes).
Let TF and ICF be typenames of Nodes.
Node[USER](gargantua)
├── Node[OCCURRENCES](source)
├── Node[TF](all sourcetype)
├── Node[ICF](all sourcetype)
├── Node[SOURCETYPE](Pubmed)
│   ├── Node[OCCURRENCES](all corpora)
│   ├── Node[TF](all corpora)
│   └── Node[ICF](all corpora)
├── Node[SOURCETYPE](WOS)
## Other ngram lists
### Group List
#### Definition
The group list gives a quantifiable link between two ngrams.
#### Policy to build group lists
To group the ngrams:
- stemming or lemmatizing
- c-value
- clustering (see graphs)
- manually by the user (supervised learning)
The scale is the character.
#### Storage
In the NodeNgramNgram table, where the Node has typename Group, for ngram1
and ngram2.
### Favorite List
#### Definition
Favorite Nodes.
The scale is the node.
#### Building policy
- manually by the user (supervised learning)
#### Storage
A NodeNode relation where the first Node has typename Favorite.
# Gargantext
Welcome to Gargantext documentation!
List of garg's own JSON API(s) urls
===================================
2016-05-27
### /api/nodes/2
```
{
"id": 2,
"parent_id": 1,
"name": "abstract:\"evaporation+loss\"",
"typename": "CORPUS"
}
```
------------------------------
### /api/nodes?pagination_limit=-1
```
{
"records": [
{
"id": 9,
"parent_id": 2,
"name": "A recording evaporimeter",
"typename": "DOCUMENT"
},
(...)
{
"id": 119,
"parent_id": 81,
"name": "GRAPH EXPLORER COOC (in:81)",
"typename": "COOCCURRENCES"
}
],
"count": 119,
"parameters": {
"formated": "json","pagination_limit": -1,
"fields": ["id","parent_id","name","typename"],
"pagination_offset": 0
}
}
```
------------------------------
### /api/nodes?types[]=CORPUS
```
{
"records": [
{
"id": 2,
"parent_id": 1,
"name": "abstract:\"evaporation+loss\"",
"typename": "CORPUS"
},
(...)
{
"id": 8181,
"parent_id": 1,
"name": "abstract:(astrogeology+OR ((space OR spatial) AND planetary) AND geology)",
"typename": "CORPUS"
}
],
"count": 2,
"parameters": {
"pagination_limit": 10,
"types": ["CORPUS"],
"formated": "json",
"pagination_offset": 0,
"fields": ["id","parent_id","name","typename"]
}
}
```
------------------------------
### /api/nodes/5?fields[]=ngrams
<5> represents a doc_id or a list_id
```
{
"ngrams": [
[1.0,{"id":2299,"n":1,"terms":designs}],
[1.0,{"id":1917,"n":1,"terms":height}],
[1.0,{"id":1755,"n":2,"terms":higher speeds}],
[1.0,{"id":1940,"n":1,"terms":cylinders}],
[1.0,{"id":2221,"n":3,"terms":other synthesized materials}],
(...)
[2.0,{"id":1970,"n":1,"terms":storms}],
[9.0,{"id":1754,"n":2,"terms":spherical gauges}],
[1.0,{"id":1895,"n":1,"terms":direction}],
[1.0,{"id":2032,"n":1,"terms":testing}],
[1.0,{"id":1981,"n":2,"terms":"wind effects"}]
]
}
```
------------------------------
### /api/nodes/3?fields[]=id&fields[]=hyperdata&fields[]=typename
```
{
"id": 3,
"typename": "DOCUMENT",
"hyperdata": {
"language_name": "English",
"language_iso3": "eng",
"language_iso2": "en",
"title": "A blabla analysis of laser treated aluminium blablabla",
"name": "A blabla analysis of laser treated aluminium blablabla",
"authors": "A K. Jain, V.N. Kulkarni, D.K. Sood"
"authorsRAW": [
{"name": "....", "affiliations": ["... Research Centre,.. 085, Country"]},
{"name": "....", "affiliations": ["... Research Centre,.. 086, Country"]}
(...)
],
"abstract": "Laser processing of materials, being a rapid melt quenching process, quite often produces a surface which is far from being ideally smooth for ion beam analysis. (...)",
"genre": ["research-article"],
"doi": "10.1016/0029-554X(81)90998-8",
"journal": "Nuclear Instruments and Methods In Physics Research",
"publication_year": "1981",
"publication_date": "1981-01-01 00:00:00",
"publication_month": "01",
"publication_day": "01",
"publication_hour": "00",
"publication_minute": "00",
"publication_second": "00",
"id": "61076EB1178A97939B1C893904C77FB7DA2276D0",
"source": "elsevier",
"distributor": "istex"
}
}
```
## TODO: continue the list
// dot ngram_parsing_flow.dot -Tpng -o ngram_parsing_flow.png
digraph ngramflow {
edge [fontsize=10] ;
label=<<B><U>gargantext.util.toolchain</U></B><BR/>(ngram extraction flow)>;
labelloc="t" ;
"extracted_ngrams" -> "grouplist" ;
"extracted_ngrams" -> "occs+ti_rank" ;
"project stoplist (todo)" -> "stoplist" ;
"stoplist" -> "mainlist" ;
"occs+ti_rank" -> "mainlist" [label=" TI_RANK_LIMIT"];
"mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
"coocs" -> "specificity" ;
"specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
"mainlist" -> "tfidf" ;
"tfidf" -> "explore" [label="doc relations with all map and candidates"];
"maplist" -> "explore" ;
"grouplist" -> "occs+ti_rank" ;
"grouplist" -> "coocs" ;
"grouplist" -> "tfidf" ;
}
# Contribution guide
## Community
* [http://gargantext.org/about](http://gargantext.org/about)
* IRC Chat: (OFTC/FreeNode) #gargantex
## Tools
* gogs
* server access
* forge
* gargantext box
## Gargantex
* Gargantex box install
(S.I.R.= Setup Install & Run procedures)
* Architecture Overview
* Database Schema Overview
* Interface design Overview
## To do:
* Docs
* Interface design
* Parsers/scrapers
* Computing
## How to contribute:
1. Clone the repo
2. Create a new branch <username>-refactoring
3. Run the gargantext-box
4. Code
5. Test
6. Commit
### Example 1: Adding a parser
* create your new file cern.py in gargantex/scrapers/
* reference it in gargantex/scrapers/urls.py
by adding this line:
import scrapers.cern as cern
* reference it in gargantext/constants
```
# type 9
{ 'name': 'Cern',
'parser': CernParser,
'default_language': 'en',
},
```
* add an APIKEY in gargantex/settings
### Example 2: User Interface Design
# Contribution guide
* A question or a problem? Ask the community
* Sources
* Tools
* Contribution workflow: for contributions, bugs and features
* Some examples of contributions
## Community
Need help? Ask the community
* [http://gargantext.org/about](http://gargantext.org/about)
* IRC Chat: (OFTC/FreeNode) #gargantex
## Source
Sources are available through the XXX LICENSE
You can install Gargantext through the [installation procedure](./install.md)
## Tools
* gogs
* forge.iscpif.fr
* server access
* gargantext box
## Contributing: workflow procedure
Once you have installed and tested Gargantext, you can:
1. Clone the stable release into your project
Note: the current stable release <release_branch> is: refactoring
Inside the repo, check out the reference branch and get the latest changes:
git checkout <ref_branch>
git pull
It is highly recommended to create a generic branch on a stable release, such as:
git checkout -b <username>-<release_branch>
git pull
2. Create your project on the stable release
git checkout -b <username>-<release_branch>-<project_name>
Do your modifications and commits as you wish:
git commit -m "foo/bar/1"
git commit -m "foo/bar/2"
git push
If you want to save your local changes, you can merge them into your generic branch <username>-<release_branch>:
git checkout <username>-<release_branch>
git pull
git merge <username>-<release_branch>-<project_name>
git commit -m "[Merge OK] comment"
## Technical Overview
* Interface Overview
* Database Schema Overview
* Architecture Overview
### Example 1: Adding a parser
### Example 2: User Interface Design
Ngram count lifecycle
-----------------------------------
### (current schema and possible leads) ###
In what creates the counts, two levels or steps can be distinguished:
1. the initial extraction and the storage of the weight of the ngram-document
relation (let us call these nodes "1doc")
2. everything else: the preparation of the aggregated counts for the terms
table ("stats"), and for the working tables of the graphs and of the
publication search.
We could perhaps speak of per-document indexing for level 1 and of "modelings" for level 2.
Note that level 1 deals with **forms**, i.e. bare ngrams (the observed form <=> a unique character string after normalization), whereas at level 2 we have richer objects... As processing goes on we still always have ngrams, but:
- filtered (we do not compute everything on everything)
- typed with the map, stop and main lists (and perhaps soon user
"own lists")...
- grouped (what we see with the `+` in the terms table, and which we could
perhaps also show on the graph side?)
We could say that at level 2 we manipulate **terms** rather than **forms**... they are still ngrams, but enriched by their inclusion in a series of mini models (aggregations and a typology of ngrams guided by usage).
### Database tables
If we adopt this distinction between forms and terms, it clarifies when what we have in the tables must be updated. On the data structure side, the counts are always stored as tuples, which can be summarized as follows:
- **1doc**: (doc:node - form:ngr - weight:float) in
NodeNgram tables
- **occs/gen/spec/tirank**: (measure_type:node - term:ngr -
weight:float) in NodeNgram tables
- **cooc**: (graph_type:node - term1:ngr - term2:ngr -
weight:float) in NodeNgramNgram tables
- **tfidf**: (publilinks_type:node - doc:node - term:ngr -
correlation:float) in NodeNodeNgram tables.
Here "type" is the node carrying the nature of the obtained stat, or the
reference of the graph for cooc and of the index linked to the publication
search for the tfidf.
There are also relations that contain no counts but are essential
to build the counts of the others:
- map/main/stop lists: (list_type:node - form or term:ngr) in
NodeNgram tables
- "groups": (mainform:ngr - subform:ngr) in
NodeNgramNgram tables.
### Update scenarios
In the flow of the "user scenarios", several events come to
**modify these counts**:
A. term creations by the user (e.g. by
selection/addition in the annotation view)
B. imports of terms corresponding to forms never indexed on
this corpus
C. term ungroupings performed by the user
D. moving a term from the stoplist to the other lists
E. any other list change and/or creation of new
groups...
A and B are the only two steps, apart from the initial extraction, where
forms are added. Currently A and B are handled right away for
level 1 (per-doc tables): it seems best to perform the
re-indexing of the 1doc as early as possible after A or B. For the
annotation view, the user expects the highlighting to appear
immediately on the displayed doc. For import B, it is convenient because
we have the list of new terms at hand, which avoids storing it
somewhere while waiting for a later recomputation.
The other information updated right away for A and B is membership
in lists and in groups (for B), which requires no computation.
C, D and E do not affect level 1 (per-doc tables) since they
add no new forms, but they do constitute modifications of the
lists and groups, and must therefore trigger a modification of the
tfidf (which requires a recomputation) and of the coocs on the map
list (an effect applied when a new graph is requested).
C and D also require an update of the per-term stats
(occurrences, gen/spec etc.) since subform elements and
stoplist elements do not appear in the stats.
So, to summarize, in all cases:
=> the addition to a list, to a group, and any count of a
new form in the docs are handled as soon as the user acts
=> but the more "advanced" modelings represented by the
occs, gen and spec stats and the "coocs on map" and
"tfidf" working tables must wait for a recomputation.
Ideally, in the future, they would all be updated incrementally
instead of forcing this recomputation... but for now this is where we stand.
### Associated functions
| | GUI | API action → url | VIEW | SUBROUTINES |
|-------|-------------------------------------------------------|-----------------------------------------------------------------------------------------------|-------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------|
| A | "annotations/highlight.js, annotations/ngramlists.js" | "PUT → api/ngrams, PUT/DEL → api/ngramlists/change" | "ApiNgrams, ListChange" | util.toolchain.ngrams_addition.index_new_ngrams |
| B | NGrams_dyna_chart_and_table | POST/PATCH → api/ngramlists/import | CSVLists | "util.ngramlists_tools.import_ngramlists, util.ngramlists_tools.merge_ngramlists, util.toolchain.ngrams_addition.index_new_ngrams" |
| C,D,E | NGrams_dyna_chart_and_table | "PUT/DEL → api/ngramlists/change, PUT/DEL → api/ngramlists/groups" | "ListChange, GroupChange" | util.toolchain.ngrams_addition.index_new_ngrams |
Import B was put back in service a few weeks ago, and I have just
reconnected A in the annotation view.
# Contribution guide
## Community
* [http://gargantext.org/about](http://gargantext.org/about)
* IRC Chat: (OFTC/FreeNode) #gargantex
## Tools
* gogs
* server access
* gargantext box
## Gargantex
* Gargantex box install
see [install procedure](install.md)
* Architecture Overview
* Database Schema Overview
* Interface design Overview
## To do:
* Docs
* Interface design
* [Parsers](./overview/parser.md) / [Scrapers](./overview/scraper.md)
* Computing
## How to contribute:
1. Clone the repo
2. Create a new branch <username>-refactoring
3. Run the gargantext-box
4. Code
5. Test
6. Commit
94eb7bdf57557b72dcd1b93a42af044b pubmed.zip
# API
Be more careful about authorizations.
cf. "ng-resource".
# Projects
## Overview of all projects
- re-implement deletion
## Single project view
- re-implement deletion
# Taggers
Path for data used by taggers should be defined in `gargantext.constants`.
# Database
# Sharing
Here follows a brief description of how sharing could be implemented.
## Database representation
The database representation of sharing can be distributed among 4 tables:
- `persons`, of which items represent either a user or a group
- `relationships` describes the relationships between persons (affiliation
of a user to a group, contact between two users, etc.)
- `nodes` contains the projects, corpora, documents, etc. to share (they shall
inherit the sharing properties from their parents)
- `permissions` stores the relations between the three tables described
above: each row consists of 2 foreign keys, plus an integer
between 1 and 3 representing the level of sharing, the start date
(when the sharing has been set) and the end date (when necessary, the time
at which sharing has been removed, `NULL` otherwise)
## Python code
The permission levels should be set in `gargantext.constants`, and defined as:
```python
PERMISSION_NONE = 0 # 0b0000
PERMISSION_READ = 1 # 0b0001
PERMISSION_WRITE = 3 # 0b0011
PERMISSION_OWNER = 7 # 0b0111
```
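With this binary encoding, each level subsumes the lower ones, so checking a permission reduces to a bitwise test:

```python
def allows(level, permission):
    # True if `level` includes `permission`, e.g. OWNER allows WRITE and READ
    return level & permission == permission

assert allows(PERMISSION_OWNER, PERMISSION_WRITE)
assert allows(PERMISSION_WRITE, PERMISSION_READ)
assert not allows(PERMISSION_READ, PERMISSION_WRITE)
```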
The requests to check for permissions (or add new ones) should not be rewritten
every time. They should be "hidden" within the models:
- `Person.owns(node)` returns a boolean
- `Person.can_read(node)` returns a boolean
- `Person.can_write(node)` returns a boolean
- `Person.give_right(node, permission)` gives a right to a given user
- `Person.remove_right(node, permission)` removes a right from a given user
- `Person.get_nodes(permission[, type])` returns an iterator on the list of
nodes on which the person has at least the given permission (optional
argument: type of requested node)
- `Node.get_persons(permission[, type])` returns an iterator on the list of
users who have at least the given permission on the node (optional argument:
type of requested persons, such as `USER` or `GROUP`)
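A minimal sketch of one of these helpers, assuming a `Permission` model mapped onto the `permissions` table and the `allows` helper above (all names illustrative, not an actual API):

```python
class Person(Base):
    # ... columns mapping the `persons` table ...

    def can_read(self, node):
        # look up the permission row linking this person to the node
        # (assumes a `session` from gargantext.util.db)
        perm = (session.query(Permission)
                       .filter(Permission.person_id == self.id,
                               Permission.node_id == node.id)
                       .first())
        return perm is not None and allows(perm.permission, PERMISSION_READ)
```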
## Example
Let's imagine the `persons` table contains the following data:
| id | type | username |
|----|-------|-----------|
| 1 | USER | David |
| 2 | GROUP | C.N.R.S. |
| 3 | USER | Alexandre |
| 4 | USER | Untel |
| 5 | GROUP | I.S.C. |
| 6 | USER | Bidule |
Assume "David" owns the groups "C.N.R.S." and "I.S.C.", "Alexandre" belongs to
the group "I.S.C.", with "Untel" and "Bidule" belonging to the group "C.N.R.S.".
"Alexandre" and "David" are in contact.
The `relationships` table then contains:
| person1_id | person2_id | type |
|------------|------------|---------|
| 1 | 2 | OWNER |
| 1 | 5 | OWNER |
| 3 | 2 | MEMBER |
| 4 | 5 | MEMBER |
| 6 | 5 | MEMBER |
| 1 | 3 | CONTACT |
The `nodes` table is populated as such:
| id | type | name |
|----|----------|----------------------|
| 12 | PROJECT | My super project |
| 13 | CORPUS | A given corpus |
| 13 | CORPUS | The corpus |
| 14 | DOCUMENT | Some document |
| 15 | DOCUMENT | Another document |
| 16 | DOCUMENT | Yet another document |
| 17 | DOCUMENT | Last document |
| 18 | PROJECT | Another project |
| 19 | PROJECT | That project |
If we want to express that "David" created "My super project" (and its children)
and wants everyone in "C.N.R.S." to be able to view it, but not access it,
`permissions` should contain:
| person_id | node_id | permission |
|-----------|---------|------------|
| 1 | 12 | OWNER |
| 2 | 12 | READ |
If "David" also wanted "Alexandre" (and no one else) to view and modify "The
corpus" (and its children), we would have:
| person_id | node_id | permission |
|-----------|---------|------------|
| 1 | 12 | OWNER |
| 2 | 12 | READ |
| 3 | 13 | WRITE |
If "Alexandre" created "That project" and wants "Bidule" (and no one else) to be
able to view and modify it (and its children), the table should then have:
| person_id | node_id | permission |
|-----------|---------|------------|
| 3 | 19 | OWNER |
| 6 | 19 | WRITE |
# User guide
1. Login
Run the gargantex box following the install procedure,
open a web browser at http://127.0.0.1:8000/,
click on Test Gargantext,
and log in with:
```
Login : gargantua
Password : autnagrag
```
2. Create a project
3. Import an existing corpus
4. Create corpus from search
5. Explore stats
6. Explore graphs
7. Query
8. Refine
* Time periods
* Nodes
9. Export
# Architecture Overview
# Database Schema
# Website
Gargantext is a web platform to explore your corpora using text-mining[...](about.md)
## Getting started
* [Install](install.md) the Gargantext box
* [Take a tour](demo.md) of the different features offered by Gargantext
## Need some help?
Ask the community at:
* [http://gargantext.org/about](http://gargantext.org/about)
* IRC Chat: (OFTC/FreeNode) #gargantex
## Want to contribute?
* take a look at the [architecture overview](overview.md)
* read the [contribution guide](contribution-guide.md)
## News
## Credits and acknowledgments
# Install Instructions for Gargamelle
Gargamelle is the Gargantext platform toolbox: a full platform system
with minimal modules.
First you need to get the source code to install it.
The folder will be /srv/gargantext:
* docs contains all information about gargantext:
/srv/gargantext/docs/
* install contains all the installation files:
/srv/gargantext/install/
Help needed?
See [http://gargantext.org/about](http://gargantext.org/about) and [tools](./contribution_guide.md) for the community
## Get the source code
by cloning gargantext into /srv/gargantext
``` bash
git clone ssh://gitolite@delanoe.org:1979/gargantext /srv/gargantext \
&& cd /srv/gargantext \
&& git fetch origin stable \
&& git checkout stable \
```
## Install
```bash
# go into the directory
user@computer: cd /srv/gargantext/
# go inside the installation folder
user@computer: cd install
# execute the installation
user@computer: ./install
```
The installation requires creating a user for gargantext; you will be asked:
```bash
Username (leave blank to use 'gargantua'):
#email is not mandatory
Email address:
Password:
Password (again):
```
If this step completed successfully, you should see:
```bash
Superuser created successfully.
[ ok ] Stopping PostgreSQL 9.5 database server: main.
```
## Run
Once installation is done, the Gargantext platform will be available at localhost:8000.
To start the gargantext platform:
``` bash
# go into the directory
user@computer: cd /srv/gargantext/
# start the platform
user@computer: ./start
# type ctrl+d to exit or simply type exit in the terminal
```
Then open up a chromium browser and go to localhost:8000.
Click on "Enter Gargantext".
Log in with the username and password you created.
Enjoy! ;)
* Create user gargantua
Main user of Gargantext is Gargantua (role of Pantagruel soon)!
``` bash
sudo adduser --disabled-password --gecos "" gargantua
```
* Create the directories you need
Here, for the example, the gargantext package will be installed in /srv/:
``` bash
for dir in "/srv/gargantext" \
           "/srv/gargantext_lib" \
           "/srv/gargantext_static" \
           "/srv/gargantext_media" \
           "/srv/env_3-5"; do
sudo mkdir -p $dir ;
sudo chown gargantua:gargantua $dir ;
done
```
You should see:
```bash
$ tree /srv
/srv
├── env_3-5
├── gargantext
├── gargantext_lib
├── gargantext_media
└── gargantext_static
```
* Get the main libraries
Download, uncompress and give the main user access to it.
Please be patient: due to the size of the package libraries (27GB),
this step can take a long time...
``` bash
wget http://dl.gargantext.org/gargantext_lib.tar.bz2 \
&& tar xvjf gargantext_lib.tar.bz2 -o /srv/gargantext_lib \
&& sudo chown -R gargantua:gargantua /srv/gargantext_lib \
&& echo "Libs installed"
```
* Get the source code of Gargantext
by cloning the repository of gargantext
``` bash
git clone ssh://gitolite@delanoe.org:1979/gargantext /srv/gargantext \
&& cd /srv/gargantext \
&& git fetch origin refactoring \
&& git checkout refactoring \
```
TODO(soon): git clone https://gogs.iscpif.fr/gargantext.git
See the [next steps of installation procedure](install.md#Install)
# Architecture Overview
# Database Schema
# Website
# HOW TO: Reference a new webscraper/API + parser
## Global scope
Three main moves to make:
- develop and index a parser
in gargantext.util.parsers
- develop and index a scraper
in gargantext.moissonneurs
- adapt forms for a new source
in templates and views
## Reference the parser into the gargantext website
The gargantext website is stored in gargantext/gargantext
### reference your new parser into constants.py
* import your parser (l. 125)
```
from gargantext.util.parsers import \
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser
```
The parser corresponds to the name of the parser referenced in gargantext/util/parser;
here the name is CernParser.
* index your RESOURCETYPE
in RESOURCETYPES (l. 145) **at the end of the list**
```
# type 10
{ "name": 'SCOAP (XML MARC21 Format)',
"parser": CernParser,
"default_language": "en",
'accepted_formats':["zip","xml"],
},
```
Note that the name here is composed of the API name (SCOAP) + (GENERICFILETYPE FORMAT_XML Format).
The complexity of the naming corresponds to three things:
* the name of the API (different from the producing organization)
* the format type: XML
* the XML standard of this format: MARC21 (cf. CernParser in gargantext/util/parser/Cern.py)
The default_language corresponds to the default accepted language, which **should load** the corresponding default tagger:
```
from gargantext.util.taggers import NltkTagger
```
TODO: load tagger types on demand depending on the languages and the install
TODO: offer a module to download additional parsers
TODO: provide install tagger module scripts inside lib
The formats correspond to the file types accepted when a file is sent through the
parsing form available in `gargantext/view/pages/projects.py` and
exposed in `/templates/pages/projects/project.html`.
## reference your parser script
## add your parser script into the folder gargantext/util/parser/
Here the filename was Cern.py.
## declare it into gargantext/util/parser/__init__.py
from .Cern import CernParser
At this step, you will be able to see your parser and add a file with the form,
but nothing will occur yet.
## the good way to write the parser script
Three main and only requirements:
* your parser class should inherit from the base class _Parser()
`gargantext/gargantext/util/parser/_Parser`
* your parser class must have a parse method that takes a **file buffer** as input
* your parser must structure and store data in a variable named **hyperdata_list**
to be properly indexed by the toolchain
! Be careful with the date format: provide a publication_date as a string in the format YYYY-mm-dd HH:MM:SS
# Adding a scraper API to offer a search option
In progress
* Add a pop-up question: Do you have a corpus?
option search in /templates/pages/projects/project.html line 181
## Reference a scraper (moissonneur) into gargantext
* add accepted_formats in constants
* add a check_file routine in the Form check ==> but it should inherit from utils/files.py,
which also implements the upload size limit check
# Suggestions for next steps:
* XML parser MARC21 UNIMARC ...
* A project type is qualified by the first element added, i.e.
the first element determines the type of corpus of all the corpora within the project
# Resources
Adding a new source into Gargantext requires a prior declaration
of the source inside constants.py
```python
RESOURCETYPES= [
{ "type":9, #give a unique type int
"name": 'SCOAP [XML]', #resource name as proposed into the add corpus FORM [generic format]
"parser": "CernParser", #name of the new parser class inside a CERN.py file (set to None if not implemented)
"format": 'MARC21', #specific format
'file_formats':["zip","xml"],# accepted file format
"crawler": "CernCrawler", #name of the new crawler class inside a CERN.py file (set to None if no Crawler implemented)
'default_languages': ['en', 'fr'], #supported defaut languages of the source
},
...
]
```
## adding a new parser
Once you have declared your new parser inside constants.py,
add your new parser file into /srv/gargantext/util/parser/
following this naming convention:
* The filename must be in uppercase without the Parser mention,
e.g. MailParser => MAIL.py
* Inside this file the Parser class must be named following the exact spelling declared as parser in constants.py
* Your new parser shall inherit from the base class Parser and provide a parse(filebuffer) method
```python
#!/usr/bin/python3 env
#filename:/srv/gargantext/util/parser/MAIL.py:
from ._Parser import Parser
class MailParser(Parser):
def parse(self, file):
...
```
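For instance, a hypothetical minimal body could build one hyperdata dict per parsed document (the field names follow the main document fields described in these docs; adapt them to your source):

```python
from ._Parser import Parser

class MailParser(Parser):
    def parse(self, file):
        # one dict per parsed document, stored in hyperdata_list
        hyperdata_list = [{
            "title": "A message subject",
            "authors": "sender@example.org",
            "publication_date": "2017-01-01 00:00:00",  # YYYY-mm-dd HH:MM:SS
            "abstract": file.read().decode("utf-8", errors="replace"),
        }]
        return hyperdata_list
```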
## adding a new crawler
Once you have declared your new crawler inside constants.py,
add your new crawler file into /srv/gargantext/util/crawler/
following this naming convention:
* The filename must be in uppercase without the Crawler mention,
e.g. MailCrawler => MAIL.py
* Inside this file the Crawler class must be named following the exact spelling declared as crawler in constants.py
* Your new crawler shall inherit from the base class Crawler and provide three methods:
* scan_results => ids
* sample => yes/no
* fetch
```python
#!/usr/bin/python3 env
#filename:/srv/gargantext/util/crawler/MAIL.py:
from ._Crawler import Crawler
class MailCrawler(Crawler):
def scan_results(self, query):
...
self.ids = set()
def sample(self, results_nb):
...
def fetch(self, ids):
```
// dot ngram_parsing_flow.dot -Tpng -o ngram_parsing_flow.png
digraph ngramflow {
edge [fontsize=10] ;
label=<<B><U>gargantext.util.toolchain</U></B><BR/>(ngram extraction flow)>;
labelloc="t" ;
"extracted_ngrams" -> "grouplist" ;
"extracted_ngrams" -> "occs+tfidfs" ;
"main_user_stoplist" -> "stoplist" ;
"stoplist" -> "mainlist" ;
"occs+tfidfs" -> "mainlist" [label=" TFIDF_LIMIT"];
"mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
"coocs" -> "specificity" ;
"specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
"maplist" -> "explore" ;
"grouplist" -> "maplist" ;
}
from .celery import app as celery_app
"""
Setup the Celery instance (see also gargantext/__init__.py) that will be
used by all shared_task.
This is the recommended way:
http://docs.celeryproject.org/en/3.1/django/first-steps-with-django.html
"""
import os
from celery import Celery
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
from django.conf import settings #noqa
app = Celery('gargantext')
app.config_from_object('django.conf:settings')
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
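# A task declared in any of the INSTALLED_APPS is then discovered
# automatically by autodiscover_tasks; a minimal sketch (illustrative,
# not an actual Gargantext task):
#
#     from celery import shared_task
#
#     @shared_task
#     def ping():
#         return 'pong'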
......@@ -59,25 +59,25 @@ LISTTYPES = {
NODETYPES = [
# TODO separate id not array index, read by models.node
None, # 0
# documents hierarchy
# node/file hierarchy
'USER', # 1
'PROJECT', # 2
#RESOURCE should be here but last
'CORPUS', # 3
'DOCUMENT', # 4
# lists
# lists of ngrams
'STOPLIST', # 5
'GROUPLIST', # 6
'MAINLIST', # 7
'MAPLIST', # 8
'COOCCURRENCES', # 9
# scores
# scores for ngrams
'OCCURRENCES', # 10
'SPECCLUSION', # 11
'CVALUE', # 12
'TFIDF-CORPUS', # 13
'TFIDF-GLOBAL', # 14
# docs subset
# node subset
'FAVORITES', # 15
# more scores (sorry!)
......@@ -197,7 +197,7 @@ RESOURCETYPES = [
'crawler': None,
},
{ 'type': 3,
'name': 'Pubmed [CRAWLER/XML]',
'name': 'Pubmed [XML]',
'format': 'Pubmed',
'parser': "PubmedParser",
'file_formats':["zip", "xml"],
......@@ -233,14 +233,14 @@ RESOURCETYPES = [
'crawler': None,
},
{ 'type': 8,
'name': 'ISTex [CRAWLER]',
'name': 'ISTex',
'format': 'json',
'parser': "ISTexParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ "type": 9,
"name": 'SCOAP [CRAWLER/XML]',
"name": 'SCOAP [API/XML]',
"parser": "CernParser",
"format": 'MARC21',
'file_formats':["zip","xml"],
......@@ -255,7 +255,7 @@ RESOURCETYPES = [
# },
#
{ "type": 10,
"name": 'REPEC [CRAWLER]',
"name": 'REPEC [MULTIVAC API]',
"parser": "MultivacParser",
"format": 'JSON',
'file_formats':["zip","json"],
......@@ -263,13 +263,21 @@ RESOURCETYPES = [
},
{ "type": 11,
"name": 'HAL [CRAWLER]',
"name": 'HAL [API]',
"parser": "HalParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "HalCrawler",
},
{ "type": 12,
"name": 'ISIDORE [SPARQLE API /!\ BETA]',
"parser": "IsidoreParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "IsidoreCrawler",
},
]
# shortcut for resources declaration in templates
PARSERS = [(n["type"], n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
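# Given the entries visible in this hunk, PARSERS contains pairs such as:
#   (3, 'Pubmed [XML]'), (8, 'ISTex'), (9, 'SCOAP [API/XML]'),
#   (10, 'REPEC [MULTIVAC API]'), (11, 'HAL [API]'), (12, 'ISIDORE [SPARQL API /!\ BETA]')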
......
from django.core.management.base import BaseCommand, CommandError
from gargantext.tools.show_nodes import tree_show, nodes
import colorama
class Command(BaseCommand):
help = 'Nodes'
def add_arguments(self, parser):
parser.add_argument(dest='action', default='show')
def handle(self, *args, **options):
action = options.get('action')
if action == 'show':
colorama.init(strip=False)
for root in nodes():
tree_show(root)
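# Usage sketch: assuming this command file is installed as
# management/commands/nodes.py (the filename is an assumption), it could be
# invoked as:
#     ./manage.py nodes show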
from django.core.management.base import BaseCommand, CommandError
from gargantext.models import Node
class Command(BaseCommand):
help = 'Something'
def handle(self, *args, **options):
self.stdout.write(self.style.SUCCESS('Oh yeah!'))
from .base import Base
from .nodes import *
from .hyperdata import *
from .users import *
......
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import relationship
from sqlalchemy.types import TypeDecorator, \
Integer, Float, Boolean, DateTime, String, Text
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
from sqlalchemy.ext.mutable import MutableDict, MutableList
from sqlalchemy.ext.declarative import declarative_base
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
"Integer", "Float", "Boolean", "DateTime", "String", "Text",
"TypeDecorator",
"JSONB", "Double",
"MutableDict", "MutableList",
"Base", "DjangoBase"]
# All the models should derive from this base class, so Base.metadata keeps
# all tables handled by Alembic migration scripts.
Base = declarative_base()
# To be used by tables already handled by Django ORM, such as User model. We
# separate them in order to keep those out of Alembic sight.
DjangoBase = declarative_base()
from gargantext.util.db import *
from gargantext.constants import INDEXED_HYPERDATA
from .base import Base, Column, ForeignKey, TypeDecorator, Index, \
Integer, Double, DateTime, String, Text
from .nodes import Node
import datetime
......@@ -64,6 +65,14 @@ class NodeHyperdata(Base):
)
"""
__tablename__ = 'nodes_hyperdata'
__table_args__ = (
Index('nodes_hyperdata_node_id_value_utc_idx', 'node_id', 'value_utc'),
Index('nodes_hyperdata_node_id_key_value_utc_idx', 'node_id', 'key', 'value_utc'),
Index('nodes_hyperdata_node_id_key_value_str_idx', 'node_id', 'key', 'value_str'),
Index('nodes_hyperdata_node_id_key_value_int_idx', 'node_id', 'key', 'value_int'),
Index('nodes_hyperdata_node_id_key_value_flt_idx', 'node_id', 'key', 'value_flt'),
Index('nodes_hyperdata_node_id_key_idx', 'node_id', 'key'))
id = Column( Integer, primary_key=True )
node_id = Column( Integer, ForeignKey(Node.id, ondelete='CASCADE'))
key = Column( HyperdataKey )
......
from gargantext.util.db import *
from gargantext.util.files import upload
from gargantext.constants import *
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from datetime import datetime
from .users import User
from .nodes import Node
#__all__ = ['Node', 'NodeType', 'Language']
class NodeType_v2(Base):
__table_args__ = {'extend_existing': True}
__tablename__ = 'node_nodetype'
id = Column(Integer, primary_key=True)
name = Column(String(255))
class Language_v2(Base):
__table_args__ = {'extend_existing': True}
__tablename__ = 'node_language'
id = Column(Integer, primary_key=True)
iso2 = Column(String(2))
iso3 = Column(String(3))
fullname = Column(String(255))
implemented = Column(Boolean)
class Node_v2(Base):
__table_args__ = {'extend_existing': True}
__tablename__ = 'node_node'
id = Column(Integer, primary_key=True)
parent_id = Column(Integer, ForeignKey('node_node.id'))
user_id = Column(Integer, ForeignKey(User.id))
type_id = Column(ForeignKey(NodeType_v2.id))
name = Column(String(255))
language_id = Column(Integer, ForeignKey(Language_v2.id))
date = Column(DateTime(), default=datetime.now)
hyperdata = Column(JSONB, default=dict)
class ResourceType(Base):
__table_args__ = {'extend_existing': True}
__tablename__ = 'node_resourcetype'
id = Column(Integer, primary_key=True)
name = Column(String(255))
class NodeResource(Base):
__table_args__ = {'extend_existing': True}
__tablename__ = 'node_node_resource'
id = Column(Integer, primary_key=True)
node_id = Column(ForeignKey(Node_v2.id))
resource_id = Column(ForeignKey(ResourceType.id))
parsed = Column(Boolean)
def nodes_list(user_id, nodetype, parent_id=None, count=False):
"""
nodes_list :: Int -> String -> Maybe Int -> Maybe Bool -> [(Int, String)]
"""
nodes = ( session.query(Node_v2.id, Node_v2.name)
.join(NodeType_v2, NodeType_v2.id == Node_v2.type_id)
.filter(NodeType_v2.name == nodetype)
)
if parent_id is not None:
nodes = nodes.filter(Node_v2.parent_id == parent_id)
if count is True:
return nodes.count()
else:
return nodes.all()
def nodes_tree(user_id):
"""
nodes_tree :: Int -> Tree Nodes
"""
for project_id, project_name in nodes_list(user_id, 'Project'):
print("* Project (%d, %s)" % (project_id, project_name))
for corpus_id, corpus_name in nodes_list(user_id, 'Corpus', parent_id=project_id):
count = nodes_list( user_id
, 'Document'
, parent_id=corpus_id
, count=True
)
if count > 1:
print("|__ %d %s" % ( corpus_id, corpus_name ))
print(" |___ %s docs" % count)
def copy_nodes(node_id, to_parent_id=None, enabled=['PROJECT', 'CORPUS', 'DOCUMENT']):
node = session.query(Node_v2).filter(Node_v2.id==node_id).first()
nodetype = session.query(NodeType_v2).filter(NodeType_v2.id == node.type_id).first()
resource = (session.query(ResourceType)
.join(NodeResource, NodeResource.resource_id == ResourceType.id)
.filter(NodeResource.node_id == node.id)
.first()
)
nodetype_proj_id = session.query(NodeType_v2.id).filter(NodeType_v2.name == 'Project' ).first()
nodetype_corp_id = session.query(NodeType_v2.id).filter(NodeType_v2.name == 'Corpus' ).first()
nodetype_docu_id = session.query(NodeType_v2.id).filter(NodeType_v2.name == 'Document').first()
typename = nodetype.name.upper()
# Import a project:
# new_project = Node(
# user_id = user.id,
# typename = 'PROJECT',
# name = name,
# )
# session.add(new_project)
# session.commit()
if typename in enabled:
parent_node = session.query(Node).filter(Node.id==to_parent_id).first()
if parent_node is not None:
corpus = parent_node.add_child(
name = node.name,
typename = typename
)
corpus.hyperdata['languages'] = {'fr' : 100}
try:
corpus.add_resource(
type = resourcetype(resource.name)
)
except:
corpus.add_resource(
type = resourcetype('Europress (French)')
)
session.add(corpus)
session.commit()
print("%s copied" % corpus.name)
nodes = (session.query(Node_v2)
.filter(Node_v2.parent_id == node.id)
.filter(Node_v2.type_id == nodetype_docu_id)
.all()
)
for n in nodes:
print(n.name)
doc = corpus.add_child( name = n.name
, typename = "DOCUMENT"
, hyperdata = n.hyperdata
)
session.add(doc)
session.commit()
# else:
# print("%d is None" % parent_id)
else:
print('%s is not enabled' % typename)
from gargantext.util.db import *
from .base import Base, Column, ForeignKey, relationship, Index, \
Integer, Float, String
from .nodes import Node
__all__ = ['Ngram', 'NodeNgram', 'NodeNodeNgram', 'NodeNgramNgram']
......@@ -7,17 +7,39 @@ __all__ = ['Ngram', 'NodeNgram', 'NodeNodeNgram', 'NodeNgramNgram']
class Ngram(Base):
__tablename__ = 'ngrams'
__table_args__ = (
Index('ngrams_id_n_idx', 'id', 'n'),
Index('ngrams_n_idx', 'n'))
id = Column(Integer, primary_key=True)
terms = Column(String(255), unique=True)
n = Column(Integer)
def __str__(self):
return '<{0.terms}>#{0.n}'.format(self)
def __repr__(self):
return '<Ngram(id={0.id}, terms={0.terms!r}, n={0.n})>'.format(self)
class NodeNgram(Base):
__tablename__ = 'nodes_ngrams'
__table_args__ = (
Index('nodes_ngrams_node_id_ngram_id_idx', 'node_id', 'ngram_id'),
Index('nodes_ngrams_node_id_idx', 'node_id'),
Index('nodes_ngrams_ngram_id_idx', 'ngram_id'))
node_id = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
ngram_id = Column(Integer, ForeignKey(Ngram.id, ondelete='CASCADE'), primary_key=True)
weight = Column(Float)
node = relationship(Node)
ngram = relationship(Ngram)
def __repr__(self):
return '<NodeNgram(node_id={0.node_id}, ngram={0.ngram}, weight={0.weight})>'.format(self)
class NodeNodeNgram(Base):
""" for instance for TFIDF
(
......@@ -28,6 +50,10 @@ class NodeNodeNgram(Base):
)
"""
__tablename__ = 'nodes_nodes_ngrams'
__table_args__ = (
Index('nodes_nodes_ngrams_node2_id_idx', 'node2_id'),
Index('nodes_nodes_ngrams_node1_id_idx', 'node1_id'))
node1_id = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
node2_id = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
ngram_id = Column(Integer, ForeignKey(Ngram.id, ondelete='CASCADE'), primary_key=True)
......@@ -36,6 +62,14 @@ class NodeNodeNgram(Base):
# sinon par défaut on aurait un type sql "double_precision" (soit 15 chiffres)
# (cf. www.postgresql.org/docs/9.4/static/datatype-numeric.html#DATATYPE-FLOAT)
node1 = relationship(Node, foreign_keys=[node1_id])
node2 = relationship(Node, foreign_keys=[node2_id])
ngram = relationship(Ngram)
def __repr__(self):
return '<NodeNodeNgram(node1_id={0.node1_id}, node2_id={0.node2_id}, ngram={0.ngram}, score={0.score})>'.format(self)
class NodeNgramNgram(Base):
""" for instance for COOCCURRENCES and GROUPLIST
(
......@@ -46,7 +80,20 @@ class NodeNgramNgram(Base):
)
"""
__tablename__ = 'nodes_ngrams_ngrams'
__table_args__ = (
Index('nodes_ngrams_ngrams_node_id_ngram1_id_ngram2_id_idx', 'node_id', 'ngram1_id', 'ngram2_id'),
Index('nodes_ngrams_ngrams_node_id_idx', 'node_id'),
Index('nodes_ngrams_ngrams_ngram1_id_idx', 'ngram1_id'),
Index('nodes_ngrams_ngrams_ngram2_id_idx', 'ngram2_id'))
node_id = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
ngram1_id = Column(Integer, ForeignKey(Ngram.id, ondelete='CASCADE'), primary_key=True)
ngram2_id = Column(Integer, ForeignKey(Ngram.id, ondelete='CASCADE'), primary_key=True)
weight = Column(Float(precision=24)) # see comment for NodeNodeNgram.score
node = relationship(Node)
ngram1 = relationship(Ngram, foreign_keys=[ngram1_id])
ngram2 = relationship(Ngram, foreign_keys=[ngram2_id])
def __repr__(self):
return '<NodeNgramNgram(node_id={0.node_id}, ngram1={0.ngram1}, ngram2={0.ngram2}, weight={0.weight})>'.format(self)
from gargantext.util.db import *
from gargantext.util.db import session
from gargantext.util.files import upload
from gargantext.constants import *
from sqlalchemy_utils.types import TSVectorType
from datetime import datetime
from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
Integer, Float, String, DateTime, JSONB, \
MutableList, MutableDict
from .users import User
__all__ = ['Node', 'NodeNode']
__all__ = ['Node', 'NodeNode', 'CorpusNode']
class NodeType(TypeDecorator):
"""Define a new type of column to describe a Node's type.
......@@ -19,23 +24,69 @@ class NodeType(TypeDecorator):
def process_result_value(self, typeindex, dialect):
return NODETYPES[typeindex]
class Node(Base):
"""This model can fit many purposes.
"""This model can fit many purposes:
myFirstCorpus = session.query(CorpusNode).first()
It intends to provide a generic model, allowing hierarchical structure
and NoSQL-like data structuring.
The possible types are defined in `gargantext.constants.NODETYPES`.
Thanks to __new__ overriding and SQLAlchemy's polymorphism, every Node
instance is automagically cast to its sub-class, assuming a typename
is specified.
>>> Node(name='without-type')
<Node(id=None, typename=None, user_id=None, parent_id=None, name='without-type', date=None)>
>>> Node(typename='CORPUS')
<CorpusNode(id=None, typename='CORPUS', user_id=None, parent_id=None, name=None, date=None)>
>>> from gargantext.util.db import session
>>> session.query(Node).filter_by(typename='USER').first() # doctest: +ELLIPSIS
<UserNode(...)>
But beware, there are some caveats with bulk queries. In this case typename
MUST be specified manually.
>>> session.query(UserNode).delete() # doctest: +SKIP
# Wrong: all nodes are deleted!
>>> session.query(UserNode).filter_by(typename='USER').delete() # doctest: +SKIP
# Right: only user nodes are deleted.
"""
__tablename__ = 'nodes'
__table_args__ = (
Index('nodes_user_id_typename_parent_id_idx', 'user_id', 'typename', 'parent_id'),
Index('nodes_hyperdata_idx', 'hyperdata'))
# TODO
# create INDEX full_text_idx on nodes using gin(to_tsvector('english', hyperdata ->> 'abstract' || 'title'));
id = Column(Integer, primary_key=True)
typename = Column(NodeType, index=True)
__mapper_args__ = { 'polymorphic_on': typename }
# foreign keys
user_id = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'))
parent_id = Column(Integer, ForeignKey('nodes.id', ondelete='CASCADE'))
# main data
user_id = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'))
user = relationship(User)
parent_id = Column(Integer, ForeignKey('nodes.id', ondelete='CASCADE'))
parent = relationship('Node', remote_side=[id])
name = Column(String(255))
date = Column(DateTime(), default=datetime.now)
date = Column(DateTime(timezone=True), default=datetime.now)
hyperdata = Column(JSONB, default=dict)
# metadata (see https://bashelton.com/2014/03/updating-postgresql-json-fields-via-sqlalchemy/)
hyperdata = Column(JSONB, default=dict)
# To make search possible uncomment the line below
#search_vector = Column(TSVectorType('hyperdata'))
def __new__(cls, *args, **kwargs):
if cls is Node and kwargs.get('typename'):
typename = kwargs.pop('typename')
return _NODE_MODELS[typename](*args, **kwargs)
return super(Node, cls).__new__(cls)
def __init__(self, **kwargs):
"""Node's constructor.
......@@ -55,6 +106,11 @@ class Node(Base):
"""
self.hyperdata[key] = value
def __repr__(self):
return '<{0.__class__.__name__}(id={0.id}, typename={0.typename!r}, ' \
'user_id={0.user_id}, parent_id={0.parent_id}, ' \
'name={0.name!r}, date={0.date})>'.format(self)
@property
def ngrams(self):
"""Pseudo-attribute allowing to retrieve a node's ngrams.
......@@ -120,36 +176,6 @@ class Node(Base):
**kwargs
)
def resources(self):
"""Return all the resources attached to a given node.
Mainly used for corpora.
example:
[{'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Français-2015-12-11.zip',
'type': 1,
'url': None}]
"""
if 'resources' not in self.hyperdata:
self['resources'] = MutableList()
return self['resources']
def add_resource(self, type, path=None, url=None):
"""Attach a resource to a given node.
Mainly used for corpora.
this just adds metadata to the CORPUS node (NOT for adding documents)
example:
{'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Français-2015-12-11.zip',
'type': 1,
'url': None}
"""
self.resources().append(MutableDict(
{'type': type, 'path':path, 'url':url, 'extracted': False}
))
def status(self, action=None, progress=0, complete=False, error=None):
"""Get or update the status of the given action.
If no action is given, the status of the first uncomplete or last item
......@@ -187,8 +213,86 @@ class Node(Base):
))
return self['statuses'][-1]
class CorpusNode(Node):
__mapper_args__ = {
'polymorphic_identity': 'CORPUS'
}
def resources(self):
"""Return all the resources attached to a given node.
example:
[{'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Français-2015-12-11.zip',
'type': 1,
'url': None}]
"""
if 'resources' not in self.hyperdata:
self['resources'] = MutableList()
return self['resources']
def add_resource(self, type, path=None, url=None):
"""Attach a resource to a given node.
this just adds metadata to the CORPUS node (NOT for adding documents)
example:
{'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Français-2015-12-11.zip',
'type': 1,
'url': None}
"""
self.resources().append(MutableDict(
{'type': type, 'path':path, 'url':url, 'extracted': False}
))
class NodeNode(Base):
__tablename__ = 'nodes_nodes'
__table_args__ = (
Index('nodes_nodes_node1_id_node2_id_idx', 'node1_id', 'node2_id'),)
node1_id = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
node2_id = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
score = Column(Float(precision=24))
node1 = relationship(Node, foreign_keys=[node1_id])
node2 = relationship(Node, foreign_keys=[node2_id])
def __repr__(self):
return '<NodeNode(node1_id={0.node1_id}, node2_id={0.node2_id}, score={0.score})>'.format(self)
# --8<-- Begin hack ------
# XXX Hack to automatically define subclasses of Node for every entry of NODETYPES,
# in order to avoid SQLAlchemy complaints -- and subsequent exceptions.
#
# We could manually write a class for every NodeType, or find a way to
# tell SQLAlchemy that it should stick to instantiating a plain Node when no
# class is defined for the wanted typename.
_ALREADY_IMPLEMENTED_NODE_TYPES = \
set(cls.__mapper_args__.get('polymorphic_identity') for cls in Node.__subclasses__())
for nodetype in NODETYPES:
if nodetype and nodetype not in _ALREADY_IMPLEMENTED_NODE_TYPES:
# Convert nodetype to a CamelCase class name, assuming it's possible...
class_name = ''.join(nodetype.title().split("-")) + 'Node'
# Create new class and add it to global scope
globals()[class_name] = type(class_name, (Node,), {
"__mapper_args__": {
"polymorphic_identity": nodetype
}
})
# Add class to exports
__all__.append(class_name)
# ------ End of hack ------
_NODE_MODELS = {
mapper.polymorphic_identity: mapper.class_
for mapper in Node.__mapper__.self_and_descendants
if mapper.class_ is not Node
}
from django.contrib.auth import models
from gargantext.util.db import *
from gargantext.util.db import session, aliased
from datetime import datetime
from .base import DjangoBase, Base, Column, ForeignKey, UniqueConstraint, \
Integer, Boolean, DateTime, String
__all__ = ['User', 'Contact']
class User(Base):
class User(DjangoBase):
# The properties below are a reflection of Django's auth module's models.
__tablename__ = models.User._meta.db_table
id = Column(Integer, primary_key=True)
......@@ -60,7 +63,7 @@ class User(Base):
"""check if a given node is owned by the user"""
return (node.user_id == self.id) or \
node.id in (contact.id for contact in self.contacts())
def get_params(self, username=None):
print(self.__dict__.items())
return self.hyperdata
......
"""Define ReplaceableObject and related operations
Implements operations to create/drop SQL objects such as views, stored
procedures and triggers that can't be "altered" but can be replaced -- hence
the name of the "ReplaceableObject" class.
This recipe is directly borrowed from Alembic documentation, see
http://alembic.zzzcomputing.com/en/latest/cookbook.html#replaceable-objects
"""
from alembic.operations import Operations, MigrateOperation
__all__ = ['ReplaceableObject']
class ReplaceableObject(object):
def __init__(self, name, sqltext):
self.name = name
self.sqltext = sqltext
class ReversibleOp(MigrateOperation):
def __init__(self, target):
self.target = target
@classmethod
def invoke_for_target(cls, operations, target):
op = cls(target)
return operations.invoke(op)
def reverse(self):
raise NotImplementedError()
@classmethod
def _get_object_from_version(cls, operations, ident):
version, objname = ident.split(".")
module = operations.get_context().script.get_revision(version).module
obj = getattr(module, objname)
return obj
@classmethod
def replace(cls, operations, target, replaces=None, replace_with=None):
if replaces:
old_obj = cls._get_object_from_version(operations, replaces)
drop_old = cls(old_obj).reverse()
create_new = cls(target)
elif replace_with:
old_obj = cls._get_object_from_version(operations, replace_with)
drop_old = cls(target).reverse()
create_new = cls(old_obj)
else:
raise TypeError("replaces or replace_with is required")
operations.invoke(drop_old)
operations.invoke(create_new)
@Operations.register_operation("create_view", "invoke_for_target")
@Operations.register_operation("replace_view", "replace")
class CreateViewOp(ReversibleOp):
def reverse(self):
return DropViewOp(self.target)
@Operations.register_operation("drop_view", "invoke_for_target")
class DropViewOp(ReversibleOp):
def reverse(self):
return CreateViewOp(self.target)
@Operations.register_operation("create_sp", "invoke_for_target")
@Operations.register_operation("replace_sp", "replace")
class CreateSPOp(ReversibleOp):
def reverse(self):
return DropSPOp(self.target)
@Operations.register_operation("drop_sp", "invoke_for_target")
class DropSPOp(ReversibleOp):
def reverse(self):
return CreateSPOp(self.target)
@Operations.implementation_for(CreateViewOp)
def create_view(operations, operation):
operations.execute("CREATE VIEW %s AS %s" % (
operation.target.name,
operation.target.sqltext
))
@Operations.implementation_for(DropViewOp)
def drop_view(operations, operation):
operations.execute("DROP VIEW %s" % operation.target.name)
@Operations.implementation_for(CreateSPOp)
def create_sp(operations, operation):
operations.execute(
"CREATE FUNCTION %s %s" % (
operation.target.name, operation.target.sqltext
)
)
@Operations.implementation_for(DropSPOp)
def drop_sp(operations, operation):
operations.execute("DROP FUNCTION %s" % operation.target.name)
# Make this a standalone script...
# Can be called this way: python3 gargantext/tools/show_nodes.py
import os
import django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
# ...End of gibberish.
import itertools
import colorama
from colorama import Fore
from sqlalchemy.sql.expression import literal_column
from gargantext.util.db import session, func, aliased
from gargantext.models import Node
NODE_BULLET = '‣'
# https://en.wikipedia.org/wiki/Box-drawing_character
TREE_ROOT = '╾'
TREE_VERT = '│'
TREE_HORI = '─'
TREE_FORK = '├'
TREE_CORN = '└'
FIRST = 0x01
LAST = 0x02
def nodes(parent=None, group_by='typename', order_by='typename', has_child='check'):
if group_by or has_child is not None:
select = [func.min(Node.id).label('id'),
func.min(Node.name).label('name'),
func.min(Node.typename).label('typename'),
func.count(Node.id).label('cnt')]
else:
select = [Node.id.label('id'),
Node.name.label('name'),
Node.typename.label('typename'),
literal_column('1').label('cnt')]
if has_child is not None:
N = aliased(Node)
select.append(func.count(N.id).label('children'))
else:
select.append(literal_column('NULL').label('children'))
parent_id = getattr(parent, 'id', parent)
q = session.query(*select).filter_by(parent_id=parent_id) \
.group_by(getattr(Node, group_by if group_by else 'id'))
if has_child is not None:
q = q.outerjoin(N, N.parent_id == Node.id).group_by(N.parent_id)
return q.order_by(order_by)
def node_show(node, prefix='', maxlen=60):
if node.children > 0 or node.cnt == 1:
name = node.name[:maxlen] + '..' if len(node.name) > maxlen else node.name
label = Fore.CYAN + name + Fore.RESET
else:
label = Fore.MAGENTA + str(node.cnt) + Fore.RESET
print(prefix, '%s%s %s' % (Fore.GREEN, node.typename, label), sep='')
def tree_show(node, pos=FIRST|LAST, level=0, prefix='', maxlen=60, compact=True):
#print('%02d %x' % (level, pos), end='')
branch = TREE_ROOT if pos&FIRST and level == 0 else TREE_FORK if not pos&LAST else TREE_CORN
node_prefix = prefix + branch + 2*TREE_HORI + ' '
node_show(node, node_prefix, maxlen)
childs = iter(nodes(parent=node, group_by=compact and 'typename'))
try:
node = next(childs)
except StopIteration:
return
prefix = prefix + (' ' if pos&LAST else TREE_VERT) + ' '
for i, next_node in enumerate(itertools.chain(childs, [None])):
pos = (FIRST if i == 0 else 0) | (LAST if next_node is None else 0)
tree_show(node, pos, level + 1, prefix, maxlen, compact)
node = next_node
if __name__ == "__main__":
import sys
if len(sys.argv) == 1:
compact = True
elif len(sys.argv) == 2 and sys.argv[1] in ('-a', '--all'):
compact = False
else:
print("Usage: %s [-a|--all]" % sys.argv[0], file=sys.stderr)
sys.exit(1)
colorama.init(strip=False)
for root in nodes():
tree_show(root, compact=compact)
......@@ -53,6 +53,7 @@ class HalCrawler(Crawler):
, deptStructId_i
, labStructId_i
, rteamStructId_i
, docType_s
"""
#, authUrl_s
#, type_s
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** ISIDORE Scraper ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save
from gargantext.util.crawlers.sparql.bool2sparql import bool2sparql, isidore
class IsidoreCrawler(Crawler):
''' ISIDORE SPARQL API CLIENT'''
def __init__(self):
# Main EndPoints
self.BASE_URL = "https://www.rechercheisidore.fr"
self.API_URL = "sparql"
# Final EndPoints
# TODO: change the endpoint according to the type of database
self.URL = self.BASE_URL + "/" + self.API_URL
self.status = []
def __format_query__(self, query=None, count=False, offset=None, limit=None):
    '''Format the query'''
    return bool2sparql(query, count=count, offset=offset, limit=limit)

def _get(self, query, offset=0, limit=None, lang=None):
    '''Get the records matching the query'''
    return isidore(query, count=False, offset=offset, limit=limit)
def scan_results(self, query):
'''
scan_results : Returns the number of results
Query String -> Int
'''
self.results_nb = [n for n in isidore(query, count=True)][0]
return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
limit = 1000
self.query_max = self.scan_results(query)
print("self.query_max : %s" % self.query_max)
if self.query_max > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
, QUERY_SIZE_N_MAX
)
print("WARNING (scrap: ISIDORE d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
for offset in range(0, self.query_max, limit):
print("Downloading result %s to %s" % (offset, self.query_max))
for doc in isidore(query, offset=offset, limit=limit) :
corpus.append(doc)
self.path = save( json.dumps(corpus).encode("utf-8")
, name='ISIDORE.json'
, basedir=UPLOAD_DIRECTORY
)
downloaded = True
return downloaded
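# A hedged usage sketch (the query is illustrative; network access and the
# bool2sparql binary are assumed):
#
#     crawler = IsidoreCrawler()
#     if crawler.download("climate"):
#         print("corpus saved to", crawler.path)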
import subprocess
import re
from .sparql import Service
#from sparql import Service
def bool2sparql(rawQuery, count=False, offset=None, limit=None):
"""
bool2sparql :: String -> Bool -> Int -> String
Translate a boolean query into a SPARQL request.
You need to build the bool2sparql binary first.
See: https://github.com/delanoe/bool2sparql
"""
query = re.sub("\"", "\'", rawQuery)
bashCommand = ["/srv/gargantext/gargantext/util/crawlers/sparql/bool2sparql-exe","-q",query]
if count is True :
bashCommand.append("-c")
else :
if offset is not None :
for command in ["--offset", str(offset)] :
bashCommand.append(command)
if limit is not None :
for command in ["--limit", str(limit)] :
bashCommand.append(command)
process = subprocess.Popen(bashCommand, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = process.communicate()
if error:
    # stderr output is bytes and cannot be raised as-is; wrap it in an exception
    raise RuntimeError(error.decode("utf-8"))
print(output)
return output.decode("utf-8")
def isidore(query, count=False, offset=None, limit=None):
"""
isidore :: String -> Bool -> Int -> Either (Dict String) Int
use sparql-client either to search or to scan
"""
query = bool2sparql(query, count=count, offset=offset, limit=limit)
go = Service("https://www.rechercheisidore.fr/sparql/", "utf-8", "GET")
results = go.query(query)
if count is False:
for r in results:
doc = dict()
doc_values = dict()
doc["url"], doc["title"], doc["date"], doc["abstract"], doc["source"] = r
for k in doc.keys():
doc_values[k] = doc[k].value
yield(doc_values)
else :
count = []
for r in results:
n, = r
count.append(int(n.value))
yield count[0]
def test():
query = "delanoe"
limit = 100
offset = 10
for d in isidore(query, offset=offset, limit=limit):
print(d["date"])
#print([n for n in isidore(query, count=True)])
if __name__ == '__main__':
test()
......@@ -5,16 +5,15 @@ from gargantext.util.json import json_dumps
########################################################################
# get engine, session, etc.
########################################################################
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import delete
from sqlalchemy_searchable import make_searchable
def get_engine():
from sqlalchemy import create_engine
url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(
**settings.DATABASES['default']
)
return create_engine( url
return create_engine( settings.DATABASES['default']['URL']
, use_native_hstore = True
, json_serializer = json_dumps
, pool_size=20, max_overflow=0
......@@ -22,20 +21,16 @@ def get_engine():
engine = get_engine()
# To make Full Text search possible, uncomment lines below
# https://sqlalchemy-searchable.readthedocs.io/
#sa.orm.configure_mappers()
Base = declarative_base()
#Base.metadata.create_all(engine)
#make_searchable()
session = scoped_session(sessionmaker(bind=engine))
########################################################################
# tools to build models
########################################################################
from sqlalchemy.types import *
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION
from sqlalchemy.ext.mutable import MutableDict, MutableList
Double = DOUBLE_PRECISION
########################################################################
# useful for queries
########################################################################
......
......@@ -10,7 +10,7 @@ __all__ = ['json_encoder', 'json_dumps']
class JSONEncoder(json.JSONEncoder):
def default(self, obj):
from gargantext.util.db import Base
from gargantext.models import Base
if isinstance(obj, Base):
return {
key: value
......
......@@ -3,7 +3,7 @@
# ****************************
# **** HAL Parser ***
# ****************************
# CNRS COPYRIGHTS
# CNRS COPYRIGHTS 2017
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** ISIDORE Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
class IsidoreParser(Parser):
def parse(self, filebuf):
'''
parse :: FileBuff -> [Hyperdata]
'''
contents = filebuf.read().decode("UTF-8")
data = json.loads(contents)
filebuf.close()
json_docs = data
hyperdata_list = []
hyperdata_path = { "title" : "title"
, "abstract" : "abstract"
, "authors" : "authors"
, "url" : "url"
, "source" : "source"
}
uniq_id = set()
for doc in json_docs:
hyperdata = {}
for key, path in hyperdata_path.items():
hyperdata[key] = doc.get(path, "")
if hyperdata["url"] not in uniq_id:
# Removing the duplicates implicitly
uniq_id.add(hyperdata["url"])
# Source is the Journal Name
hyperdata["source"] = doc.get("source", "ISIDORE Database")
# Working on the date
maybeDate = doc.get("date" , None)
if maybeDate is None:
date = datetime.now()
else:
try :
# Model of date: 1958-01-01T00:00:00
date = datetime.strptime(maybeDate, '%Y-%m-%dT%H:%M:%S')
except :
print("FIX DATE ISIDORE please >%s<" % maybeDate)
date = datetime.now()
hyperdata["publication_date"] = date
hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(date.day)
hyperdata_list.append(hyperdata)
return hyperdata_list
......@@ -12,15 +12,19 @@ from gargantext.constants import DEFAULT_MAPLIST_MAX,\
DEFAULT_MAPLIST_GENCLUSION_RATIO,\
DEFAULT_MAPLIST_MONOGRAMS_RATIO
def do_maplist_query():
return None
def do_maplist(corpus,
overwrite_id = None,
mainlist_id = None,
specclusion_id = None,
genclusion_id = None,
grouplist_id = None,
limit=DEFAULT_MAPLIST_MAX,
genclusion_part=DEFAULT_MAPLIST_GENCLUSION_RATIO,
monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO
overwrite_id = None,
mainlist_id = None,
specclusion_id = None,
genclusion_id = None,
grouplist_id = None,
limit = DEFAULT_MAPLIST_MAX,
genclusion_part = DEFAULT_MAPLIST_GENCLUSION_RATIO,
monograms_part = DEFAULT_MAPLIST_MONOGRAMS_RATIO
):
'''
According to Genericity/Specificity and mainlist
......@@ -28,9 +32,9 @@ def do_maplist(corpus,
Parameters:
- mainlist_id (starting point, already cleaned of stoplist terms)
- specclusion_id (ngram inclusion by cooc specificity -- ranking factor)
- genclusion_id (ngram inclusion by cooc genericity -- ranking factor)
- genclusion_id (ngram inclusion by cooc genericity -- ranking factor)
- grouplist_id (filtering grouped ones)
- overwrite_id: optional if preexisting MAPLIST node to overwrite
- overwrite_id: optional. Overwrite if preexisting MAPLIST node
+ 3 params to modulate the terms choice
- limit for the amount of picked terms
......@@ -77,6 +81,7 @@ def do_maplist(corpus,
)
.join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.join(ScoreGen, ScoreGen.ngram_id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specclusion_id)
.filter(ScoreGen.node_id == genclusion_id)
......@@ -155,10 +160,10 @@ def do_maplist(corpus,
# at the end of the first loop we just need to sort all by the second ranker (gen)
scored_ngrams = sorted(scored_ngrams, key=lambda ng_infos: ng_infos[2], reverse=True)
obtained_spec_mono = len(chosen_ngrams['topspec']['monograms'])
obtained_spec_mono = len(chosen_ngrams['topspec']['monograms'])
obtained_spec_multi = len(chosen_ngrams['topspec']['multigrams'])
obtained_gen_mono = len(chosen_ngrams['topgen']['monograms'])
obtained_gen_multi = len(chosen_ngrams['topgen']['multigrams'])
obtained_gen_mono = len(chosen_ngrams['topgen']['monograms'])
obtained_gen_multi = len(chosen_ngrams['topgen']['multigrams'])
obtained_total = obtained_spec_mono \
+ obtained_spec_multi \
+ obtained_gen_mono \
......
......@@ -175,7 +175,6 @@ def parse(corpus):
hyperdata = hyperdata,
)
session.add(document)
session.commit()
documents_count += 1
if pending_add_error_stats:
......@@ -190,6 +189,9 @@ def parse(corpus):
session.add(corpus)
session.commit()
# Commit any pending document
session.commit()
# update info about the resource
resource['extracted'] = True
#print( "resource n°",i, ":", d, "docs inside this file")
......
#!/bin/bash
### Update and install base dependencies
echo "############ DEBIAN LIBS ###############"
apt-get update && \
......@@ -32,26 +34,27 @@ update-locale LC_ALL=fr_FR.UTF-8
libxml2-dev xml-core libgfortran-6-dev \
libpq-dev \
python3.5 \
python3-dev \
python3.5-dev \
python3-six python3-numpy python3-setuptools \
python3-numexpr \
python3-pip \
libxml2-dev libxslt-dev zlib1g-dev
libxml2-dev libxslt-dev zlib1g-dev libigraph0-dev
#libxslt1-dev
UPDATE AND CLEAN
# UPDATE AND CLEAN
apt-get update && apt-get autoclean
#NB: removing /var/lib will avoid significantly filling up your /var/ folder on your native system
########################################################################
### PYTHON ENVIRONNEMENT (as ROOT)
########################################################################
#adduser --disabled-password --gecos "" gargantua
cd /srv/
pip3 install virtualenv
virtualenv /srv/env_3-5
virtualenv /srv/env_3-5 -p /usr/bin/python3.5
echo '/srv/gargantext' > /srv/env_3-5/lib/python3.5/site-packages/gargantext.pth
echo 'alias venv="source /srv/env_3-5/bin/activate"' >> ~/.bashrc
# CONFIG FILES
......@@ -60,9 +63,9 @@ update-locale LC_ALL=fr_FR.UTF-8
source /srv/env_3-5/bin/activate && pip3 install -r /srv/gargantext/install/gargamelle/requirements.txt && \
pip3 install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1 && \
python3 -m nltk.downloader averaged_perceptron_tagger -d /usr/local/share/nltk_data
chown gargantua:gargantua -R /srv/env_3-5
#######################################################################
## POSTGRESQL DATA (as ROOT)
#######################################################################
......
......@@ -15,9 +15,9 @@ RUN apt-get update && \
apt-utils ca-certificates locales \
sudo aptitude gcc g++ wget git vim \
build-essential make \
postgresql-9.5 postgresql-client-9.5 postgresql-contrib-9.5 \
postgresql-server-dev-9.5 libpq-dev libxml2 \
postgresql-9.5 postgresql-client-9.5 postgresql-contrib-9.5
postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6 \
postgresql-server-dev-9.6 libpq-dev libxml2 \
postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6
### Configure timezone and locale
......@@ -37,7 +37,7 @@ ENV LC_ALL fr_FR.UTF-8
### Install main dependencies and python packages based on Debian distrib
RUN echo "############# PYTHON DEPENDENCIES ###############"
RUN apt-get update && apt-get install -y \
libxml2-dev xml-core libgfortran-5-dev \
libxml2-dev xml-core libgfortran-6-dev \
libpq-dev \
python3.5 \
python3-dev \
......@@ -47,8 +47,8 @@ RUN apt-get update && apt-get install -y \
# python dependencies
python3-pip \
# for lxml
libxml2-dev libxslt-dev
#libxslt1-dev zlib1g-dev
libxml2-dev libxslt-dev \
libxslt1-dev zlib1g-dev
# UPDATE AND CLEAN
RUN apt-get update && apt-get autoclean &&\
......
......@@ -14,7 +14,7 @@ echo "::::: DJANGO :::::"
/bin/su gargantua -c 'source /env_3-5/bin/activate &&\
su gargantua -c 'source /srv/env_3-5/bin/activate &&\
echo "Activated env" &&\
/srv/gargantext/manage.py makemigrations &&\
/srv/gargantext/manage.py migrate && \
......@@ -24,4 +24,4 @@ echo "::::: DJANGO :::::"
/srv/gargantext/dbmigrate.py && \
/srv/gargantext/manage.py createsuperuser'
/usr/sbin/service postgresql stop
service postgresql stop
##
# You should look at the following URL's in order to grasp a solid understanding
# of Nginx configuration files in order to fully unleash the power of Nginx.
# http://wiki.nginx.org/Pitfalls
# http://wiki.nginx.org/QuickStart
# http://wiki.nginx.org/Configuration
#
# Generally, you will want to move this file somewhere, and start with a clean
# file but keep this around for reference. Or just disable in sites-enabled.
#
# Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples.
##
# the upstream component nginx needs to connect to
upstream gargantext {
server unix:///tmp/gargantext.sock; # for a file socket
#server 127.0.0.1:8001; # for a web port socket (we'll use this first)
}
# Default server configuration
#
server {
listen 80 default_server;
listen [::]:80 default_server;
# SSL configuration
#
# listen 443 ssl default_server;
# listen [::]:443 ssl default_server;
#
# Note: You should disable gzip for SSL traffic.
# See: https://bugs.debian.org/773332
#
# Read up on ssl_ciphers to ensure a secure configuration.
# See: https://bugs.debian.org/765782
#
# Self signed certs generated by the ssl-cert package
# Don't use them in a production server!
#
# include snippets/snakeoil.conf;
client_max_body_size 800M;
client_body_timeout 12;
client_header_timeout 12;
keepalive_timeout 15;
send_timeout 10;
root /var/www/html;
# Add index.php to the list if you are using PHP
#index index.html index.htm index.nginx-debian.html;
server_name _ localhost ;
# Django media
location /media {
alias /srv/gargantext_media; # your Django project's media files - amend as required
}
location /static {
alias /srv/gargantext_static; # your Django project's static files - amend as required
}
# Finally, send all non-media requests to the Django server.
location / {
uwsgi_pass gargantext;
include uwsgi_params;
}
#access_log off;
access_log /var/log/nginx/access.log;
error_log /var/log/nginx/error.log;
}
-- ____
-- / ___|
-- | | _
-- | |_| |
-- \____|arganTexT
----------------------------------------------------------------------
-- Gargantext optimization of Database --
----------------------------------------------------------------------
--> Manual optimization with indexes according to usages
-- Weaknesses and strengths of indexes:
--> they can slow down insertions
--> they can speed up selections
--> Conventions for this document:
--> indexes that are commented out have already been created
--> indexes that are not commented out have not been created yet
----------------------------------------------------------------------
-- Retrieve Nodes
----------------------------------------------------------------------
create INDEX on nodes (user_id, typename, parent_id) ;
create INDEX on nodes_hyperdata (node_id, key);
create INDEX on ngrams (id, n) ;
create INDEX on ngrams (n) ;
create INDEX on nodes_ngrams (node_id, ngram_id) ;
create INDEX on nodes_ngrams (node_id) ;
create INDEX on nodes_ngrams (ngram_id) ;
create INDEX on nodes_ngrams_ngrams (node_id, ngram1_id, ngram2_id) ;
create INDEX on nodes_ngrams_ngrams (node_id) ;
create INDEX on nodes_ngrams_ngrams (ngram1_id) ;
create INDEX on nodes_ngrams_ngrams (ngram2_id) ;
----------------------------------------------------------------------
-- DELETE optimization of Nodes -- todo on dev
create INDEX on nodes_nodes_ngrams (node1_id);
create INDEX on nodes_nodes_ngrams (node2_id);
create INDEX on nodes_nodes (node1_id, node2_id);
-- Maybe needed soon:
-- create INDEX on nodes_nodes_ngrams (node1_id, node2_id);
----------------------------------------------------------------------
-- Analytics
create INDEX on nodes_hyperdata (node_id,value_utc); -- remove ?
create INDEX on nodes_hyperdata (node_id,key,value_utc);
create INDEX on nodes_hyperdata (node_id,key,value_int);
create INDEX on nodes_hyperdata (node_id,key,value_flt);
create INDEX on nodes_hyperdata (node_id,key,value_str);
----------------------------------------------------------------------
----------------------------------------------------------------------
create index on nodes using GIN (hyperdata);
----------------------------------------------------------------------
# try bottleneck
eventlet==0.20.1
amqp==1.4.9
anyjson==0.3.3
billiard==3.3.0.23
......@@ -32,3 +33,5 @@ lxml==3.5.0
requests-futures==0.9.7
bs4==0.0.1
requests==2.10.0
alembic>=0.9.2
# SQLAlchemy-Searchable==0.10.4
#!/bin/bash
sudo adduser --disabled-password --gecos "" notebooks
sudo docker rm $(sudo docker ps -a | grep sh | awk '{print $1}')
sudo docker build -t garg-notebook:latest ./notebook
#!/bin/bash
#-v /srv/gargandata:/srv/gargandata \
#-v /srv/gargantext_lib:/srv/gargantext_lib \
sudo docker rm $(sudo docker ps -a | grep notebook | grep sh | awk '{print $1}')
sudo docker run \
--name=garg-notebook \
-v /srv/gargantext:/srv/gargantext \
-p 8899:8899 \
-it garg-notebook:latest \
/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser --notebook-dir=/home/notebooks/'"
#/bin/bash -c "/bin/su gargantua -c 'source /env_3-5/bin/activate && jupyter notebook --port=8899 --ip=127.0.0.1 --no-browser'"
###########################################################
# Gargamelle WEB
###########################################################
#Build an image starting with debian:stretch image
# which contains all the source code of the app
FROM debian:stretch
MAINTAINER ISCPIF <gargantext@iscpif.fr>
USER root
### Update and install base dependencies
RUN echo "############ DEBIAN LIBS ###############"
RUN apt-get update && \
apt-get install -y \
apt-utils ca-certificates locales \
sudo aptitude gcc g++ wget git vim \
build-essential make \
curl \
postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6 \
postgresql-server-dev-9.6 libpq-dev libxml2 \
postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6
# Install Stack
### Configure timezone and locale
RUN echo "########### LOCALES & TZ #################"
RUN echo "Europe/Paris" > /etc/timezone
ENV TZ "Europe/Paris"
RUN sed -i -e 's/# en_GB.UTF-8 UTF-8/en_GB.UTF-8 UTF-8/' /etc/locale.gen && \
sed -i -e 's/# fr_FR.UTF-8 UTF-8/fr_FR.UTF-8 UTF-8/' /etc/locale.gen && \
dpkg-reconfigure --frontend=noninteractive locales && \
echo 'LANG="fr_FR.UTF-8"' > /etc/default/locale
ENV LANG fr_FR.UTF-8
ENV LANGUAGE fr_FR.UTF-8
ENV LC_ALL fr_FR.UTF-8
### Install main dependencies and python packages based on Debian distrib
RUN echo "############# PYTHON DEPENDENCIES ###############"
RUN apt-get update && apt-get install -y \
libxml2-dev xml-core libgfortran-6-dev \
libpq-dev \
python3.5 \
python3-dev \
# for numpy, pandas and numpyperf \
python3-six python3-numpy python3-setuptools \
python3-numexpr \
# python dependencies \
python3-pip \
# for lxml
libxml2-dev libxslt-dev libxslt1-dev zlib1g-dev
# UPDATE AND CLEAN
RUN apt-get update && apt-get autoclean \
&& rm -rf /var/lib/apt/lists/*
#NB: removing /var/lib will avoid significantly filling up your /var/ folder on your native system
########################################################################
### PYTHON ENVIRONNEMENT (as ROOT)
########################################################################
RUN adduser --disabled-password --gecos "" notebooks
RUN pip3 install virtualenv
RUN virtualenv /env_3-5
RUN echo 'alias venv="source /env_3-5/bin/activate"' >> ~/.bashrc
# CONFIG FILES
ADD requirements.txt /
ADD psql_configure.sh /
ADD django_configure.sh /
RUN . /env_3-5/bin/activate && pip3 install -r requirements.txt && \
pip3 install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1 && \
python3 -m nltk.downloader averaged_perceptron_tagger -d /usr/local/share/nltk_data
#RUN ./psql_configure.sh
#RUN ./django_configure.sh
RUN chown notebooks:notebooks -R /env_3-5
########################################################################
### Notebook IHaskell and IPYTHON ENVIRONNEMENT
########################################################################
RUN apt-get update && apt-get install -y \
libtinfo-dev \
libzmq3-dev \
libcairo2-dev \
libpango1.0-dev \
libmagic-dev \
libblas-dev \
liblapack-dev
RUN curl -sSL https://get.haskellstack.org/ | sh
RUN stack setup
RUN git clone https://github.com/gibiansky/IHaskell
RUN . /env_3-5/bin/activate \
&& cd IHaskell \
&& stack install gtk2hs-buildtools \
&& stack install --fast \
&& /root/.local/bin/ihaskell install --stack
#
########################################################################
### POSTGRESQL DATA (as ROOT)
########################################################################
#RUN sed -iP "s%^data_directory.*%data_directory = \'\/srv\/gargandata\'%" /etc/postgresql/9.5/main/postgresql.conf
#RUN echo "host all all 0.0.0.0/0 md5" >> /etc/postgresql/9.5/main/pg_hba.conf
#RUN echo "listen_addresses='*'" >> /etc/postgresql/9.5/main/postgresql.conf
EXPOSE 8899
VOLUME ["/srv/","/home/notebooks/"]
#!/bin/bash
##################################################
# __| |(_) __ _ _ __ __ _ ___
# / _` || |/ _` | '_ \ / _` |/ _ \
# | (_| || | (_| | | | | (_| | (_) |
# \__,_|/ |\__,_|_| |_|\__, |\___/
# |__/ |___/
##################################################
#configure django migrations
##################################################
echo "::::: DJANGO :::::"
#echo "Starting Postgres"
#/usr/sbin/service postgresql start
su gargantua -c 'source /srv/env_3-5/bin/activate &&\
echo "Activated env" &&\
/srv/gargantext/manage.py makemigrations &&\
/srv/gargantext/manage.py migrate && \
echo "migrations ok" &&\
/srv/gargantext/dbmigrate.py && \
/srv/gargantext/dbmigrate.py && \
/srv/gargantext/dbmigrate.py && \
/srv/gargantext/manage.py createsuperuser'
service postgresql stop
"""
Gargantext Software Copyright (c) 2016 CNRS ISC-PIF -
http://iscpif.fr
Licence (see :
http://gitlab.iscpif.fr/humanities/gargantext/blob/stable/LICENSE )
- In France : a CECILL variant affero compliant
- GNU aGPLV3 for all other countries
"""
#!/usr/bin/env python
import sys
import os
# Django settings
dirname = os.path.dirname(os.path.realpath(__file__))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext.settings")
# initialize Django application
from django.core.wsgi import get_wsgi_application
application = get_wsgi_application()
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from gargantext.util.db import *
from gargantext.models import Node
from nltk.tokenize import wordpunct_tokenize
from gargantext.models import *
from nltk.tokenize import word_tokenize
import nltk as nltk
from statistics import mean
from math import log
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import datetime
from collections import Counter
from langdetect import detect as detect_lang
def documents(corpus_id):
return (session.query(Node).filter( Node.parent_id==corpus_id
, Node.typename=="DOCUMENT"
)
# .order_by(Node.hyperdata['publication_date'])
.all()
)
import seaborn as sns
import pandas as pd
def chart(docs, field):
year_publis = list(Counter([doc.hyperdata[field] for doc in docs]).items())
frame0 = pd.DataFrame(year_publis, columns=['Date', 'DateValue'])
frame1 = pd.DataFrame(year_publis, columns=['Date', 'DateValue'], index=frame0.Date)
return frame1
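# Usage sketch (the corpus id is hypothetical):
#
#     frame = chart(documents(1234), 'publication_year')
#     frame.plot()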
from gargantext.util.crawlers.HAL import HalCrawler
def scan_hal(request):
hal = HalCrawler()
return hal.scan_results(request)
def scan_gargantext(corpus_id, lang, request):
    # TODO add some sugar to the request (ideally the request should be the same for hal and garg)
    connection = get_engine().connect()
    query = """select count(n.id) from nodes n
               where to_tsvector('%s', hyperdata ->> 'abstract' || 'title')
               @@ to_tsquery('%s')
               AND n.parent_id = %s;""" % (lang, request, corpus_id)
    result = [i for i in connection.execute(query)][0][0]
    connection.close()  # close the connection before returning, otherwise this call is unreachable
    return result
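# A hedged usage sketch (ids and query strings are hypothetical):
#
#     print(scan_hal("graphene"))                        # hits in HAL
#     print(scan_gargantext(42, "english", "graphene"))  # hits in local corpus 42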
#!/bin/bash
#######################################################################
## ____ _
## | _ \ ___ ___| |_ __ _ _ __ ___ ___
## | |_) / _ \/ __| __/ _` | '__/ _ \/ __|
## | __/ (_) \__ \ || (_| | | | __/\__ \
## |_| \___/|___/\__\__, |_| \___||___/
## |___/
#######################################################################
echo "::::: POSTGRESQL :::::"
su postgres -c 'pg_dropcluster 9.4 main --stop'
#done in docker but redoing it
rm -rf /srv/gargandata && mkdir /srv/gargandata && chown postgres:postgres /srv/gargandata
su postgres -c '/usr/lib/postgresql/9.6/bin/initdb -D /srv/gargandata/'
su postgres -c '/usr/lib/postgresql/9.6/bin/pg_ctl -D /srv/gargandata/ -l /srv/gargandata/journal_applicatif start'
su postgres -c 'pg_createcluster -D /srv/gargandata 9.6 main '
su postgres -c 'pg_ctlcluster -D /srv/gargandata 9.6 main start '
su postgres -c 'pg_ctlcluster 9.6 main start'
service postgresql start
su postgres -c "psql -c \"CREATE user gargantua WITH PASSWORD 'C8kdcUrAQy66U'\""
su postgres -c "createdb -O gargantua gargandb"
echo "Postgres configured"
#service postgresql stop
# try bottleneck
eventlet==0.20.1
amqp==1.4.9
anyjson==0.3.3
billiard==3.3.0.23
celery==3.1.25
chardet==2.3.0
dateparser==0.3.5
Django==1.10.5
django-celery==3.2.1
django-pgfields==1.4.4
django-pgjsonb==0.0.23
djangorestframework==3.5.3
html5lib==0.9999999
#python-igraph>=0.7.1
jdatetime==1.7.2
kombu==3.0.37 # messaging
langdetect==1.0.6 #detectinglanguage
nltk==3.1
numpy==1.10.4
psycopg2==2.6.2
pycountry==1.20
python-dateutil==2.4.2
pytz==2016.10 # timezones
PyYAML==3.11
RandomWords==0.1.12
ujson==1.35
umalqurra==0.2 # arabic calendars (?? why use ??)
networkx==1.11
pandas==0.18.0
six==1.10.0
lxml==3.5.0
requests-futures==0.9.7
bs4==0.0.1
requests==2.10.0
djangorestframework-jwt==1.9.0
jupyter==1.0.0
jupyter-client==5.0.0
jupyter-console==5.1.0
jupyter-core==4.3.0
ipython==5.2.0
ipython-genutils==0.1.0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** ISIDORE Crawler *****
# ****************************
RESOURCE_TYPE_ISIDORE = 12
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
def query( request):
'''get GlobalResults()'''
if request.method == "POST":
query = request.POST["query"]
source = get_resource(RESOURCE_TYPE_ISIDORE)
if source["crawler"] is not None:
crawlerbot = load_crawler(source)()
#old raw way to get results_nb
results = crawlerbot.scan_results(query)
#ids = crawlerbot.get_ids(query)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
'''save'''
if request.method == "POST":
query = request.POST.get("query")
try:
N = int(request.POST.get("N"))
except:
N = 0
print(query, N)
#for next time
#ids = request.POST["ids"]
source = get_resource(RESOURCE_TYPE_ISIDORE)
if N == 0:
raise Http404()
if N > QUERY_SIZE_N_MAX:
N = QUERY_SIZE_N_MAX
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
return HttpResponseForbidden()
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : "fr"
}
)
#download_file
crawler_bot = load_crawler(source)()
#for now no way to force downloading X records
#the long running command
filename = crawler_bot.download(query)
corpus.add_resource(
type = source["type"]
#, name = source["name"]
, path = crawler_bot.path
)
session.add(corpus)
session.commit()
#corpus_id = corpus.id
try:
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
data = [query, N]
print(data)
return JsonHttpResponse(data)
......@@ -10,19 +10,15 @@
# moissonneurs == getting data from external databases
# Available databases :
## Pubmed
## IsTex,
## CERN
from django.conf.urls import url
# Available databases :
import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex
import moissonneurs.cern as cern
import moissonneurs.multivac as multivac
import moissonneurs.hal as hal
import moissonneurs.isidore as isidore
# TODO : ISIDORE
......@@ -42,7 +38,7 @@ urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
, url(r'^hal/query$' , hal.query )
, url(r'^hal/save/(\d+)' , hal.save )
#, url(r'^isidore/query$' , isidore.query )
#, url(r'^isidore/save/(\d+)' , isidore.save )
, url(r'^isidore/query$' , isidore.query )
, url(r'^isidore/save/(\d+)' , isidore.save )
]
......@@ -368,7 +368,7 @@
<p>
Gargantext
<span class="glyphicon glyphicon-registration-mark" aria-hidden="true"></span>
, version 3.0.6.8,
, version 3.0.7,
<a href="http://www.cnrs.fr" target="blank" title="Institution that enables this project.">
Copyrights
<span class="glyphicon glyphicon-copyright-mark" aria-hidden="true"></span>
......
......@@ -41,78 +41,77 @@
<div class="container theme-showcase" role="main">
<div class="jumbotron">
<div class="row">
<div class="col-md-4">
<h1>
<span class="glyphicon glyphicon-home" aria-hidden="true"></span>
Projects
</h1>
</div>
<div class="col-md-3"></div>
<div class="col-md-5">
<p id="project" class="help">
<br>
<button id="add" type="button" class="btn btn-primary btn-lg help" data-container="body" data-toggle="popover" data-placement="bottom">
<span class="glyphicon glyphicon-plus" aria-hidden="true"></span>
Add a new project
</button>
<div id="popover-content" class="hide">
<div id="createForm" class="form-group">
{% csrf_token %}
<div id="status-form" class="collapse">
</div>
<div class="row inline">
<label class="col-lg-3" for="inputName" ><span class="pull-right">Name:</span></label>
<input class="col-lg-8" type="text" id="inputName" class="form-control">
</div>
<div class="row inline">
<div class="col-lg-3"></div>
<button id="createProject" class="btn btn-primary btn-sm col-lg-8 push-left">Add Project</button>
<div class="col-lg-2"></div>
<div class="col-md-4">
<h1>
<span class="glyphicon glyphicon-home" aria-hidden="true"></span>
Projects
</h1>
</div>
<div class="col-md-3"></div>
<div class="col-md-5">
<p id="project" class="help">
<br>
<button id="add" type="button" class="btn btn-primary btn-lg help" data-container="body" data-toggle="popover" data-placement="bottom">
<span class="glyphicon glyphicon-plus" aria-hidden="true"></span>
Add a new project
</button>
<div id="popover-content" class="hide">
<form>
<div id="createForm" class="form-group">
{% csrf_token %}
<div id="status-form" class="collapse"></div>
<div class="row inline">
<label class="col-lg-3" for="inputName" ><span class="pull-right">Name:</span></label>
<input class="col-lg-8" type="text" id="inputName" class="form-control">
</div>
<div class="row inline">
<div class="col-lg-3"></div>
<button id="createProject" class="btn btn-primary btn-sm col-lg-8 push-left">Add Project</button>
<div class="col-lg-2"></div>
</div>
</div>
</form>
</div>
</div>
</div>
</p>
</p>
</div>
</div>
</div>
</div>
<div class="container">
<!-- GENERIC STATUS INFO -->
<div id="status" class="row col-lg-12 collapse">
<div class="container">
<!-- GENERIC STATUS INFO -->
<div id="status" class="row col-lg-12 collapse">
<div id="status-msg" class="alert">
</div>
<div id="status-msg" class="alert">
</div>
</div>
<!-- CHECKBOX EDITION -->
<!--
<div class="row collapse" id="editor">
<button title="delete selected project" type="button" class="btn btn-danger" id="delete">
<span class="glyphicon glyphicon-trash " aria-hidden="true" ></span>
</button>
<button title="edit selected project" type="button" class="btn btn-warning" id="edit">
<span class="glyphicon glyphicon-pencil " aria-hidden="true" onclick="editProjects()"></span>
</button> -->
<!-- <button type="button" class="btn btn-info" id="recalculate">
<span class="glyphicon glyphicon-refresh " aria-hidden="true" onclick="recalculateProjects()"></span>
</button>
-->
</div>
<br />
</div>
<!-- CHECKBOX EDITION -->
<!--
<div class="row collapse" id="editor">
<button title="delete selected project" type="button" class="btn btn-danger" id="delete">
<span class="glyphicon glyphicon-trash " aria-hidden="true" ></span>
</button>
<button title="edit selected project" type="button" class="btn btn-warning" id="edit">
<span class="glyphicon glyphicon-pencil " aria-hidden="true" onclick="editProjects()"></span>
</button> -->
<!-- <button type="button" class="btn btn-info" id="recalculate">
<span class="glyphicon glyphicon-refresh " aria-hidden="true" onclick="recalculateProjects()"></span>
</button>
</div>
-->
<br />
<div class="row container" id="projects">
<!--here loading projectlist from GET /projects-->
</div>
<img id="wait-img" width="90%" style="display:none" src="{% static "img/ajax-loader.gif"%}"></img>
<div class="row container" id="projects">
<!--here loading projectlist from GET /projects-->
</div>
</div>
<img id="wait-img" width="90%" style="display:none" src="{% static "img/ajax-loader.gif"%}"></img>
<script type="html/tpl" id="project_item">
<div id="{url}" class="item row">
@@ -675,7 +675,7 @@
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){
saveMultivac(pubmedquery, N);
saveMultivac(pubmedquery, N, "/moissonneurs/multivac/save/");
//$("#submit_thing").onclick()
})}
//(N > {{query_size}})
@@ -684,7 +684,7 @@
$('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){
saveMultivac(pubmedquery, N);
saveMultivac(pubmedquery, N,"/moissonneurs/multivac/save/" );
//$("#submit_thing").onclick()
})}
}
@@ -708,7 +708,6 @@
//HAL = 11
if (SourceTypeId == "11"){
$.ajax({
// contentType: "application/json",
@@ -736,7 +735,7 @@
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){
saveALL(pubmedquery, N);
save(pubmedquery, N, "/moissonneurs/hal/save/");
//$("#submit_thing").onclick()
})}
//(N > {{query_size}})
@@ -745,7 +744,7 @@
$('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){
saveALL(pubmedquery, N);
save(pubmedquery, N, "/moissonneurs/hal/save/");
//$("#submit_thing").onclick()
})}
}
@@ -768,6 +767,69 @@
}
//ISIDORE = 12
if (SourceTypeId == "12"){
    $.ajax({
        // contentType: "application/json",
        url: window.location.origin+"/moissonneurs/isidore/query",
        data: formData,
        type: 'POST',
        beforeSend: function(xhr) {
            xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
        },
        success: function(data) {
            console.log(data)
            console.log("SUCCESS")
            console.log("enabling "+"#"+value.id)
            // $("#"+value.id).attr('onclick','getGlobalResults(this);');
            $("#submit_thing").prop('disabled' , false)
            //$("#submit_thing").html("Process a {{ query_size }} sample!")

            N = data["results_nb"]
            if(N > 0) {
                if (N <= {{query_size}}){
                    $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
                    $("#submit_thing").html("Download!")
                    $("#submit_thing").prop('disabled' , false)
                    //$("#submit_thing").attr('onclick', testCERN(query, N));
                    $("#submit_thing").on("click", function(){
                        save(pubmedquery, N, "/moissonneurs/isidore/save/");
                        //$("#submit_thing").onclick()
                    })}
                //(N > {{query_size}})
                else {
                    $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
                    $('#submit_thing').prop('disabled', false);
                    $("#submit_thing").html("Processing a sample file")
                    $("#submit_thing").on("click", function(){
                        save(pubmedquery, N, "/moissonneurs/isidore/save/");
                        //$("#submit_thing").onclick()
                    })}
            }
            else {
                $("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!</i><br>")
                // data[0] only means something when the endpoint returns an array
                if(data[0]==false)
                    $("#theresults").html("<i>"+theType+" connection error!</i><br>")
                $('#submit_thing').prop('disabled', true);
            }
        },
        error: function(result) {
            $("#theresults").html("<i>"+theType+" connection error</i><br>")
            $('#submit_thing').prop('disabled', true);
        }
    });
}
}
// CSS events for selecting one Radio-Input
@@ -819,6 +881,7 @@
|| selectedId == "9"
|| selectedId == "10"
|| selectedId == "11"
|| selectedId == "12"
) {
console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible");
@@ -1001,7 +1064,7 @@
});
}
function saveALL(query, N){
function save(query, N, urlGarg){
console.log("In Gargantext")
if(!query || query=="") return;
@@ -1016,7 +1079,7 @@
console.log(data)
$.ajax({
dataType: 'json',
url: window.location.origin+"/moissonneurs/hal/save/"+projectid,
url: window.location.origin + urlGarg + projectid,
data: data,
type: 'POST',
beforeSend: function(xhr) {
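Note on the two hunks above: saveALL hard-coded the HAL endpoint, so every new source needed its own copy of the function. The renamed save(query, N, urlGarg) takes the endpoint prefix as an argument, and each harvester now passes its own route, as in the calls shown earlier (the same parametrization is applied to saveMultivac):

save(pubmedquery, N, "/moissonneurs/hal/save/");
save(pubmedquery, N, "/moissonneurs/isidore/save/");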
@@ -26,8 +26,7 @@ environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext.settings")
DATABASES['default']['NAME'] = DATABASES['default']['TEST']['NAME']
setup() # models can now be imported
from gargantext import models # Base is now filled
from gargantext.util.db import Base # contains metadata.tables
from gargantext.models import Base # contains metadata.tables
# ------------------------------------------------------------------------------
# thanks to our hack, util.db.engine and util.db.session already use the test DB
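The Base import swap above reflects where the metadata now lives: gargantext.models must be imported first so that the declarative Base is populated. A quick sanity check, as a sketch under the same test settings:

# sketch: after `from gargantext import models`, Base.metadata
# holds every SQLAlchemy-managed table
from gargantext import models          # fills Base
from gargantext.models import Base     # contains metadata.tables
print(sorted(Base.metadata.tables))    # table names, e.g. 'nodes'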