Commit 584511ba authored by PkSM3's avatar PkSM3

Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into samuel

parents e57bf3af 0f3b14dd
......@@ -6,9 +6,12 @@ from sqlalchemy.sql import func
import numpy as np
import collections
ResourceType = models.ResourceType.sa
Resource = models.Resource.sa
NodeType = models.NodeType.sa
NodeNgram = models.Node_Ngram.sa
NodeNodeNgram = models.NodeNgramNgram.sa
NodeNodeNgram = models.NodeNodeNgram.sa
NodeNgramNgram = models.NodeNgramNgram.sa
Ngram = models.Ngram.sa
Node_Metadata = models.Node_Metadata.sa
Metadata = models.Metadata.sa
......
......@@ -17,11 +17,11 @@ def get_team():
'''
team = [
{ 'first_name' : 'Alexandre', 'last_name' : 'Delanoë', 'mail' : 'alexandre+gargantextATdelanoe.org', 'website' : 'http://alexandre.delanoe.org', 'picture' : 'alexandre.jpg'},
{ 'first_name' : 'David', 'last_name' : 'Chavalarias', 'mail' : 'david.chavalariasATiscpif.fr', 'website' : 'http://chavalarias.com', 'picture' : 'david.jpg'},
{ 'first_name' : 'Mathieu', 'last_name' : 'Rodic', 'mail' : '', 'website' : 'http://rodic.fr', 'picture' : 'mathieu.jpg'},
{ 'first_name' : 'Samuel', 'last_name' : 'Castillo J.', 'mail' : 'kaisleanATgmail.com', 'website' : 'http://www.pksm3.droppages.com', 'picture' : 'samuel.jpg'},
{ 'first_name' : 'Elias', 'last_name' : 'Showk', 'mail' : '', 'website' : 'https://github.com/elishowk', 'picture' : ''},
{ 'first_name' : 'Alexandre', 'last_name' : 'Delanoë', 'mail' : 'alexandre+gargantextATdelanoe.org', 'website' : 'http://alexandre.delanoe.org', 'picture' : 'alexandre.jpg', 'role' : 'project manager, scientific board, developer'},
{ 'first_name' : 'David', 'last_name' : 'Chavalarias', 'mail' : 'david.chavalariasATiscpif.fr', 'website' : 'http://chavalarias.com', 'picture' : 'david.jpg', 'role':'scientific board'},
{ 'first_name' : 'Mathieu', 'last_name' : 'Rodic', 'mail' : '', 'website' : 'http://rodic.fr', 'picture' : 'mathieu.jpg', 'role' : 'developer'},
{ 'first_name' : 'Samuel', 'last_name' : 'Castillo J.', 'mail' : 'kaisleanATgmail.com', 'website' : 'http://www.pksm3.droppages.com', 'picture' : 'samuel.jpg', 'role' : 'developer'},
{ 'first_name' : 'Elias', 'last_name' : 'Showk', 'mail' : '', 'website' : 'https://github.com/elishowk', 'picture' : '', 'role' : 'developer'},
#{ 'first_name' : '', 'last_name' : '', 'mail' : '', 'website' : '', 'picture' : '', 'role' : ''},
# copy-paste the line above and fill in your information, please
]
......
......@@ -10,96 +10,7 @@ from sqlalchemy import text, distinct
from sqlalchemy.sql import func
from sqlalchemy.orm import aliased
from node import models
NodeType = models.NodeType.sa
Node = models.Node.sa
Node_Ngram = models.Node_Ngram.sa
Ngram = models.Ngram.sa
Metadata = models.Metadata.sa
Node_Metadata = models.Node_Metadata.sa
# for debugging only
def literalquery(statement, dialect=None):
"""Generate an SQL expression string with bound parameters rendered inline
for the given SQLAlchemy statement.
WARNING: This method of escaping is insecure, incomplete, and for debugging
purposes only. Executing SQL statements with inline-rendered user values is
extremely insecure.
"""
from datetime import datetime
import sqlalchemy.orm
if isinstance(statement, sqlalchemy.orm.Query):
if dialect is None:
dialect = statement.session.get_bind(
statement._mapper_zero_or_none()
).dialect
statement = statement.statement
if dialect is None:
dialect = getattr(statement.bind, 'dialect', None)
if dialect is None:
from sqlalchemy.dialects import mysql
dialect = mysql.dialect()
Compiler = type(statement._compiler(dialect))
class LiteralCompiler(Compiler):
visit_bindparam = Compiler.render_literal_bindparam
def render_literal_value(self, value, type_):
return "'" + str(value) + "'"
# if isinstance(value, (float, int)):
# return str(value)
# elif isinstance(value, datetime):
# return repr(str(value))
# else: # fallback
# value = super(LiteralCompiler, self).render_literal_value(
# value, type_,
# )
# if isinstance(value, unicode):
# return value.encode('UTF-8')
# else:
# return value
return LiteralCompiler(dialect, statement)
# these might be used for SQLAlchemy
def get_session():
import sqlalchemy.orm
from django.db import connections
from sqlalchemy.orm import sessionmaker
from aldjemy.core import get_engine
alias = 'default'
connection = connections[alias]
engine = get_engine()
Session = sessionmaker(bind=engine)
return Session()
def get_connection():
from aldjemy.core import get_engine
engine = get_engine()
return engine.connect()
# for recursive queries
# _sql_cte = '''
# WITH RECURSIVE cte ("depth", "path", "ordering", "id") AS (
# SELECT 1 AS depth,
# array[T."id"] AS path,
# array[T."id"] AS ordering,
# T."id"
# FROM %s T
# WHERE T."parent_id" IS NULL
# UNION ALL
# SELECT cte.depth + 1 AS depth,
# cte.path || T."id",
# cte.ordering || array[T."id"],
# T."id"
# FROM %s T
# JOIN cte ON T."parent_id" = cte."id"
# )
# ''' % (Node._meta.db_table, Node._meta.db_table, )
from .db import *
def DebugHttpResponse(data):
......@@ -235,7 +146,7 @@ class NodesChildrenDuplicates(APIView):
)
# build the query
groups = list(columns)
duplicates_query = (get_session()
duplicates_query = (session
.query(*(extra_columns + [func.count()] + columns))
.select_from(Node)
)
......@@ -273,7 +184,6 @@ class NodesChildrenDuplicates(APIView):
})
def delete(self, request, node_id):
session = get_session()
# get the minimum ID for each of the nodes sharing the same metadata
kept_node_ids_query = self._fetch_duplicates(request, node_id, [func.min(Node.id).label('id')], 0)
kept_node_ids = [kept_node.id for kept_node in kept_node_ids_query]
......@@ -397,6 +307,10 @@ class NodesChildrenQueries(APIView):
# return value
return field, _operators[operator], value
def _count_documents(self, query):
return {
'fields': []
}
def post(self, request, node_id):
""" Query the children of the given node.
......@@ -509,7 +423,7 @@ class NodesChildrenQueries(APIView):
# starting the query!
document_type_id = NodeType.query(NodeType.id).filter(NodeType.name == 'Document').scalar()
query = (get_session()
query = (session
.query(*fields_list)
.select_from(Node)
.filter(Node.type_id == document_type_id)
......@@ -656,7 +570,7 @@ class NodesList(APIView):
class Nodes(APIView):
def get(self, request, node_id):
node = models.Node.objects.filter(id = node_id).first()
node = session.query(Node).filter(Node.id == node_id).first()
if node is None:
raise APIException('This node does not exist', 404)
return JsonHttpResponse({
......@@ -667,9 +581,10 @@ class Nodes(APIView):
})
# deleting node by id
# currently, very dangerous
# currently, very dangerous.
# it should take the subnodes into account as well,
# for better consistency...
def delete(self, request, node_id):
session = get_session()
node = models.Node.objects.filter(id = node_id)
msgres = ""
try:
......
from node import models
from gargantext_web import settings
__all__ = ['literalquery', 'session', 'cache']
# map the Django models found in node.models to SQLAlchemy models
for model_name, model in models.__dict__.items():
if hasattr(model, 'sa'):
globals()[model_name] = model.sa
__all__.append(model_name)
NodeNgram = Node_Ngram
# debugging tool, to render SQLAlchemy queries as strings
def literalquery(statement, dialect=None):
"""Generate an SQL expression string with bound parameters rendered inline
for the given SQLAlchemy statement.
WARNING: This method of escaping is insecure, incomplete, and for debugging
purposes only. Executing SQL statements with inline-rendered user values is
extremely insecure.
"""
from datetime import datetime
import sqlalchemy.orm
if isinstance(statement, sqlalchemy.orm.Query):
if dialect is None:
dialect = statement.session.get_bind(
statement._mapper_zero_or_none()
).dialect
statement = statement.statement
if dialect is None:
dialect = getattr(statement.bind, 'dialect', None)
if dialect is None:
from sqlalchemy.dialects import mysql
dialect = mysql.dialect()
Compiler = type(statement._compiler(dialect))
class LiteralCompiler(Compiler):
visit_bindparam = Compiler.render_literal_bindparam
def render_literal_value(self, value, type_):
return "'" + str(value) + "'"
if isinstance(value, (float, int)):
return str(value)
elif isinstance(value, datetime):
return repr(str(value))
else:
if isinstance(value, str):
return value.encode('UTF-8')
else:
return value
return LiteralCompiler(dialect, statement)
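For quick debugging, a minimal usage sketch of `literalquery` (the module path `gargantext_web.db` is an assumption about where this file lives; `session` and `Node` are defined elsewhere in this module):

    # hedged sketch: adjust the import to the actual module location
    from gargantext_web.db import literalquery, session, Node

    query = session.query(Node).filter(Node.id == 1)
    print(literalquery(query))  # prints the SQL with bound parameters rendered inline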
# SQLAlchemy session management
def get_sessionmaker():
from django.db import connections
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
alias = 'default'
connection = connections[alias]
url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}/{NAME}'.format(
**settings.DATABASES['default']
)
engine = create_engine(url, use_native_hstore=True)
return sessionmaker(bind=engine)
Session = get_sessionmaker()
session = Session()
# SQLAlchemy model objects caching
from sqlalchemy import or_
class ModelCache(dict):
def __init__(self, model, preload=False):
self._model = model.sa
self._columns_names = [column.name for column in model._meta.fields if column.unique]
self._columns = [getattr(self._model, column_name) for column_name in self._columns_names]
self._columns_validators = []
if preload:
self.preload()
def __missing__(self, key):
for column in self._columns:
conditions = []
try:
formatted_key = column.type.python_type(key)
conditions.append(column == key)
except ValueError:
pass
if formatted_key in self:
self[key] = self[formatted_key]
else:
element = session.query(self._model).filter(or_(*conditions)).first()
if element is None:
raise KeyError
self[key] = element
return element
def preload(self):
self.clear()
for element in session.query(self._model).all():
for column_name in self._columns_names:
key = getattr(element, column_name)
self[key] = element
class Cache:
def __getattr__(self, key):
try:
model = getattr(models, key)
except AttributeError:
raise AttributeError('No such model: `%s`' % key)
modelcache = ModelCache(model)
setattr(self, key, modelcache)
return modelcache
cache = Cache()
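A minimal usage sketch of the `session` and `cache` objects defined above (assumes a NodeType named 'Document' exists in the database, as the views below expect; the import path is an assumption):

    from gargantext_web.db import session, cache, Node

    document_type = cache.NodeType['Document']  # looked up once, then memoized in the cache
    documents = (session
        .query(Node)
        .filter(Node.type_id == document_type.id)
        .limit(10)
        .all()
    )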
......@@ -25,7 +25,7 @@ urlpatterns = patterns('',
# User Home view
url(r'^$', views.home),
url(r'^about/', views.about),
url(r'^about/', views.get_about),
# Project Management
url(r'^projects/$', views.projects),
......@@ -53,13 +53,13 @@ urlpatterns = patterns('',
# Data management
url(r'^api$', gargantext_web.api.Root),
url(r'^api/nodes$', gargantext_web.api.NodesList.as_view()),
url(r'^api/nodes/(\d+)$', gargantext_web.api.Nodes.as_view()),
url(r'^api/nodes/(\d+)/children/ngrams$', gargantext_web.api.NodesChildrenNgrams.as_view()),
url(r'^api/nodes/(\d+)/children/metadata$', gargantext_web.api.NodesChildrenMetatadata.as_view()),
url(r'^api/nodes/(\d+)/children/queries$', gargantext_web.api.NodesChildrenQueries.as_view()),
url(r'^api/nodes/(\d+)/children/duplicates$', gargantext_web.api.NodesChildrenDuplicates.as_view()),
# url(r'^api/nodes/(\d+)/children/duplicates/delete$', gargantext_web.api.NodesChildrenDuplicates.delete ),
url(r'^api/nodes/(\d+)$', gargantext_web.api.Nodes.as_view()),
url(r'^api/nodes$', gargantext_web.api.NodesList.as_view()),
url(r'^api/project/(\d+)/corpus/(\d+)/timerange/(\d+)/(\d+)$', views.subcorpusJSON),
......
......@@ -5,6 +5,7 @@ from django.http import Http404, HttpResponse, HttpResponseRedirect
from django.template.loader import get_template
from django.template import Context
from node import models
from node.models import Language, ResourceType, Resource, \
Node, NodeType, Node_Resource, Project, Corpus, \
Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram
......@@ -144,8 +145,8 @@ def date_range(start_dt, end_dt = None, format=None):
# SOME VIEWS
from gargantext_web import team
def about(request):
from gargantext_web import about
def get_about(request):
'''
About Gargantext, the team and sponsors
'''
......@@ -153,8 +154,8 @@ def about(request):
user = request.user
date = datetime.datetime.now()
members = team.get_team()
sponsors = team.get_sponsors()
members = about.get_team()
sponsors = about.get_sponsors()
html = template.render(Context({\
'user': user,\
......@@ -243,12 +244,12 @@ def project(request, project_id):
type_corpus = NodeType.objects.get(name='Corpus')
type_document = NodeType.objects.get(name='Document')
type_whitelist = NodeType.objects.get(name='WhiteList')
type_blacklist = NodeType.objects.get(name='BlackList')
type_cooclist = NodeType.objects.get(name='Cooccurrence')
# type_whitelist = NodeType.objects.get(name='WhiteList')
# type_blacklist = NodeType.objects.get(name='BlackList')
# type_cooclist = NodeType.objects.get(name='Cooccurrence')
project = Node.objects.get(id=project_id)
corpora = project.children.filter(type=type_corpus)
corpora = Node.objects.filter(parent=project, type=type_corpus)
number = len(corpora)
# DONUT corpora representation
......@@ -265,13 +266,13 @@ def project(request, project_id):
for corpus in corpora:
# print("corpus", corpus.pk , corpus.name , corpus.type_id)
docs_count = corpus.children.count()
docs_count = Node.objects.filter(parent=corpus, type=type_document).count()
docs_total += docs_count
corpus_view = dict()
corpus_view['id'] = corpus.pk
corpus_view['name'] = corpus.name
corpus_view['count'] = corpus.children.count()
corpus_view['count'] = docs_count
# just get the first element of the corpora and get its type.
......
......@@ -29,6 +29,9 @@ sudo apt-get install gfortran
sudo apt-get install libopenblas-dev
sudo apt-get install liblapack-dev
#nlpserver
sudo apt-get install libgflags-dev
sudo aptitude install libgoogle-glog-dev
source /srv/gargantext_env/bin/activate
pip3 install git+https://github.com/mathieurodic/aldjemy.git
-- Indexing text fields
CREATE INDEX node_node_name ON node_node (name);
CREATE INDEX node_node_metadata_valuetext ON node_node_metadata (value_text);
CREATE INDEX node_ngram_terms ON node_ngram (terms);
-- indexing ALL foreign keys
CREATE INDEX node_ngram__language_id ON node_ngram (language_id);
CREATE INDEX node_node__type_id ON node_node (type_id);
CREATE INDEX node_node__user_id ON node_node (user_id);
CREATE INDEX node_node__language_id ON node_node (language_id);
CREATE INDEX node_node__parent_id ON node_node (parent_id);
CREATE INDEX node_node_metadata__node_id ON node_node_metadata (node_id);
CREATE INDEX node_node_metadata__metadata_id ON node_node_metadata (metadata_id);
CREATE INDEX node_node_ngram__ngram_id ON node_node_ngram (ngram_id);
CREATE INDEX node_node_ngram__node_id ON node_node_ngram (node_id);
CREATE INDEX node_nodengramngram__node_id ON node_nodengramngram (node_id);
CREATE INDEX node_nodengramngram__ngramx_id ON node_nodengramngram (ngramx_id);
CREATE INDEX node_nodengramngram__ngramy_id ON node_nodengramngram (ngramy_id);
CREATE INDEX node_nodenodengram__nodey_id ON node_nodenodengram (nodey_id);
CREATE INDEX node_nodenodengram__ngram_id ON node_nodenodengram (ngram_id);
CREATE INDEX node_nodenodengram__nodex_id ON node_nodenodengram (nodex_id);
CREATE INDEX node_node_resource__node_id ON node_node_resource (node_id);
CREATE INDEX node_node_resource__resource_id ON node_node_resource (resource_id);
CREATE INDEX node_resource__user_id ON node_resource (user_id);
CREATE INDEX node_resource__type_id ON node_resource (type_id);
......@@ -33,15 +33,15 @@ def _upload_to(instance, filename):
class Language(models.Model):
iso2 = models.CharField(max_length=2, unique=True)
iso3 = models.CharField(max_length=3)
fullname = models.CharField(max_length=255)
iso3 = models.CharField(max_length=3, unique=True)
fullname = models.CharField(max_length=255, unique=True)
implemented = models.BooleanField(blank=True)
def __str__(self):
return self.fullname
class ResourceType(models.Model):
name = models.CharField(max_length=255)
name = models.CharField(max_length=255, unique=True)
def __str__(self):
return self.name
......@@ -49,7 +49,7 @@ class ResourceType(models.Model):
class Ngram(models.Model):
language = models.ForeignKey(Language, blank=True, null=True, on_delete=models.SET_NULL)
n = models.IntegerField()
terms = models.CharField(max_length=255)
terms = models.CharField(max_length=255, unique=True)
nodes = models.ManyToManyField(through='Node_Ngram', to='Node')
def __str__(self):
......@@ -66,7 +66,7 @@ class Resource(models.Model):
return self.file
class NodeType(models.Model):
name = models.CharField(max_length=200)
name = models.CharField(max_length=200, unique=True)
def __str__(self):
return self.name
......
import node.models
from parsing.NgramsExtractors import *
from .NgramsExtractors import *
from collections import defaultdict
......
from parsing.FileParsers.RisFileParser import RisFileParser
from parsing.FileParsers.IsiFileParser import IsiFileParser
from parsing.FileParsers.PubmedFileParser import PubmedFileParser
from parsing.FileParsers.EuropressFileParser import EuropressFileParser
from parsing.FileParsers.ISText import ISText
from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
from parsing.Taggers import TreeTagger
from .NgramsExtractor import NgramsExtractor
from ..Taggers import TreeTagger
class FrenchNgramsExtractor(NgramsExtractor):
......
from parsing.Taggers import Tagger
from ..Taggers import Tagger
import nltk
......@@ -33,14 +33,14 @@ class NgramsExtractor:
grammar = nltk.RegexpParser(self._rule)
result = []
try:
grammar_parsed = grammar.parse(tagged_ngrams)
for subtree in grammar_parsed.subtrees():
if subtree.label() == self._label:
result.append(subtree.leaves())
except:
print("Problem while parsing rule '%s'" % (self._rule, ))
pass
# try:
grammar_parsed = grammar.parse(tagged_ngrams)
for subtree in grammar_parsed.subtrees():
if subtree.label() == self._label:
result.append(subtree.leaves())
# except Exception as e:
# print("Problem while parsing rule '%s'" % (self._rule, ))
# print(e)
return result
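The chunking above relies on nltk's RegexpParser; a standalone sketch of the same pattern (the rule and the tagged tokens here are invented for illustration, not the extractor's actual ones):

    import nltk

    rule = 'NP: {<JJ>*<NN.*>+}'  # hypothetical rule
    grammar = nltk.RegexpParser(rule)
    tagged = [('novel', 'JJ'), ('treatments', 'NNS'), ('for', 'IN'), ('schizophrenia', 'NN')]
    for subtree in grammar.parse(tagged).subtrees():
        if subtree.label() == 'NP':
            print(subtree.leaves())
    # prints [('novel', 'JJ'), ('treatments', 'NNS')] and [('schizophrenia', 'NN')]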
......
from .NgramsExtractor import NgramsExtractor
from ..Taggers import TurboTagger
class TurboNgramsExtractor(NgramsExtractor):
def start(self):
self.tagger = TurboTagger()
from parsing.NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
from .FrenchNgramsExtractor import FrenchNgramsExtractor
from .TurboNgramsExtractor import TurboNgramsExtractor as EnglishNgramsExtractor
# from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from .NgramsExtractor import NgramsExtractor
from .Tagger import Tagger
from .nlpserver.client import NLPClient
class TurboTagger:
def start(self):
self._nlpclient = NLPClient()
def stop(self):
if hasattr(self, '_nlpclient'):
del self._nlpclient
def tag_text(self, text):
if not hasattr(self, '_nlpclient'):
self._nlpclient = NLPClient()
tokens_tags = []
for sentence in self._nlpclient.tag(text):
for token, tag in sentence:
tokens_tags.append((token, tag, ))
return tokens_tags
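A minimal sketch of how TurboTagger is meant to be used (assumes the nlpserver described further down is already running on the host and port configured in its settings.py):

    from parsing.Taggers import TurboTagger  # import path per the package __init__ below

    tagger = TurboTagger()
    tagger.start()  # opens the TCP connection to the nlpserver
    tags = tagger.tag_text('Gargantext is a web platform to explore text-mining.')
    print(tags)  # a list of (token, tag) tuples; actual tags depend on the loaded model
    tagger.stop()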
from parsing.Taggers.Tagger import Tagger
from parsing.Taggers.NltkTagger import NltkTagger
from parsing.Taggers.TreeTagger import TreeTagger
from .Tagger import Tagger
from .NltkTagger import NltkTagger
from .TreeTagger import TreeTagger
from .TurboTagger import TurboTagger
GETTING STARTED
===============
* Download the following files (if all you need is tagging, the second
archive is not necessary):
- http://www.ark.cs.cmu.edu/TurboParser/sample_models/english_proj_tagger.tar.gz
- http://www.ark.cs.cmu.edu/TurboParser/sample_models/english_proj_parser.tar.gz
* Expand them, and place the extracted files in the `data` directory
CONFIGURATION
=============
The settings for the server can be found in `settings.py`.
Please ensure the TCP port is not already in use on your machine, and that the paths to the models are correct.
START/STOP THE SERVER
=====================
Simply run the following command to start: `./nlpserver start`
To stop: `./nlpserver stop`
If starting the server failed, have a look at the log in `nlpserver.log`.
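USAGE EXAMPLE
=============
Once the server is running, it can be queried from Python with the bundled client
(a minimal sketch; the import path is an assumption based on the package layout,
and host/port come from `settings.py`):

    from parsing.Taggers.nlpserver.client import NLPClient

    client = NLPClient()
    for sentence in client.tag('The cat sat on the mat.'):
        print(sentence)  # one list of [token, tag] pairs per sentence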
import socket
import sys
import re
from .settings import server_type_client, server_host, server_port, server_buffer
from .settings import implemented_methods
class NLPClient:
def __init__(self):
self._socket = socket.socket(*server_type_client)
self._socket.connect((server_host, server_port))
for method_name in dir(self):
if method_name[0] != '_':
if method_name.upper() not in implemented_methods:
setattr(self, method_name, self._notimplemented)
def __del__(self):
self._socket.close()
def _notimplemented(self, *args, **kwargs):
raise NotImplementedError(
'Only the following methods are allowed: {}'.format(
', '.join(implemented_methods)
)
)
def _getline(self):
"""Get one line of text from the buffer
"""
buf = self._socket.recv(server_buffer).decode()
done = False
while not done:
if '\n' in buf:
line, buf = buf.split('\n', 1)
yield line
else:
more = self._socket.recv(server_buffer).decode()
if not more:
done = True
else:
buf += more
if buf:
yield buf
def _request(self, action, text, language, keys=None):
"""Generic method to request info from the server
"""
data = action + ' '
data += language + '\n'
data += re.sub(r'\n+', '\n', text)
data += '\n\n'
self.__init__()
self._socket.sendall(data.encode())
sentence = []
if keys is None:
for line in self._getline():
if not line:
if not sentence:
break
yield sentence
sentence = []
continue
sentence.append(line.split('\t'))
else:
for line in self._getline():
if not line:
if not sentence:
break
yield sentence
sentence = []
continue
values = line.split('\t')
sentence.append(dict(zip(keys, line.split('\t'))))
self.__del__()
def tokenize(self, text, language='english', asdict=False):
keys = ('token', ) if asdict else None
return self._request('TOKENIZE', text, language, keys)
def tag(self, text, language='english', asdict=False):
keys = ('token', 'tag', ) if asdict else None
return self._request('TAG', text, language, keys)
def lemmatize(self, text, language='english', asdict=False):
keys = ('token', 'tag', 'lemma') if asdict else None
return self._request('LEMMATIZE', text, language, keys)
def parse(self, text, language='english', asdict=False):
keys = ('token', 'tag', 'lemma', 'head', 'deprel', ) if asdict else None
return self._request('PARSE', text, language, keys)
# Benchmark when the script is called directly
if __name__ == '__main__':
from time import time
text = """Current therapeutics for schizophrenia, the typical and atypical antipsychotic class of drugs, derive their therapeutic benefit predominantly by antagonism of the dopamine D2 receptor subtype and have robust clinical benefit on positive symptoms of the disease with limited to no impact on negative symptoms and cognitive impairment. Driven by these therapeutic limitations of current treatments and the recognition that transmitter systems beyond the dopaminergic system in particular glutamatergic transmission contribute to the etiology of schizophrenia significant recent efforts have focused on the discovery and development of novel treatments for schizophrenia with mechanisms of action that are distinct from current drugs. Specifically, compounds selectively targeting the metabotropic glutamate receptor 2/3 subtype, phosphodiesterase subtype 10, glycine transporter subtype 1 and the alpha7 nicotinic acetylcholine receptor have been the subject of intense drug discovery and development efforts. Here we review recent clinical experience with the most advanced drug candidates targeting each of these novel mechanisms and discuss whether these new agents are living up to expectations."""
text = open('/home/mat/projects/parser/animal-farm.txt').read()
client = NLPClient()
iterations = int(sys.argv[1]) if len(sys.argv) > 1 else 1
for asdict in (False, True):
print()
print('Retrieving results as ' + (
'dict' if asdict else 'list'
) + 's')
print('---------------------------')
for method_name in dir(client):
if method_name[0] != '_':
method = getattr(client, method_name)
print('%-16s' % method_name, end='')
t0 = time()
n = 0.0
for i in range(0, iterations):
try:
for sentence in method(text, asdict=asdict):
n += 1.0
t = time() - t0
print('%8.2f s %8.2f ms per sentence' % (t, 1000*t/n if n else 0.0))
except NotImplementedError:
print('(not implemented)')
print()
# lemmatize 2.89 s 1.76 ms per sentence
# parse 25.21 s 15.37 ms per sentence
# tag 2.90 s 1.77 ms per sentence
# tokenize 0.19 s 0.12 ms per sentence
*.model
\ No newline at end of file
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
lemmatizer = WordNetLemmatizer()
_lemmatize = lemmatizer.lemmatize
tags_translate = defaultdict(str)
tags_translate.update({
'J': 'a',
'N': 'n',
'V': 'v',
})
def lemmatize(token, tag):
tag_type = tags_translate[tag[0]]
return _lemmatize(token, tag_type) if tag_type else token
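A quick check of the helper above, with Penn Treebank-style tags as produced by the tagger (requires the WordNet corpus to be available to nltk):

    print(lemmatize('treatments', 'NNS'))  # -> 'treatment'
    print(lemmatize('running', 'VBG'))     # -> 'run'
    print(lemmatize('of', 'IN'))           # -> 'of' (tag not in tags_translate, token returned unchanged)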
#!/bin/sh
# In case this bash file is placed in another directory (e.g., /etc/init.d),
# the following line should be changed to an absolute path
DAEMON_DIR=$( cd "$(dirname "$BASH_SOURCE[0]")" && pwd)
DAEMON_SCRIPT=$DAEMON_DIR/server.py
DAEMON_NAME=nlpserver
DAEMON_ARGS=
# DAEMON_USER=root
# The process ID of the script when it runs is stored here:
DAEMON_PID=/tmp/$DAEMON_NAME.pid
. /lib/lsb/init-functions
do_start () {
log_daemon_msg "Starting system '$DAEMON_NAME' daemon..."
/sbin/start-stop-daemon --start --quiet \
--make-pidfile --pidfile $DAEMON_PID --background \
--startas /bin/bash -- -c "python3 $DAEMON_SCRIPT $DAEMON_ARGS > /tmp/$DAEMON_NAME.log 2>&1"
# --exec $DAEMON_SCRIPT \
# --user $DAEMON_USER --chuid $DAEMON_USER
log_end_msg $?
}
do_stop () {
log_daemon_msg "Stopping system '$DAEMON_NAME' daemon..."
/sbin/start-stop-daemon --stop --pidfile $DAEMON_PID --retry 10
log_end_msg $?
}
case "$1" in
start|stop)
do_${1}
;;
restart|reload|force-reload)
do_stop
do_start
;;
status)
status_of_proc "$DAEMON_NAME" "$DAEMON" && exit 0 || exit $?
;;
*)
echo "Usage: $DAEMON_NAME {start|stop|restart|status}"
exit 1
;;
esac
exit 0
from settings import *
from sys import stderr
def print(text):
stderr.write(text + '\n')
print('PREPARING TURBOPARSER')
import turboparser
turbo_interface = turboparser.PTurboParser()
print('LOADING TOKENIZERS')
import nltk
sentence_tokenizer = nltk.data.load(tokenizer_model)
word_tokenizer = nltk.TreebankWordTokenizer()
if 'TAG' in implemented_methods or 'LEMMATIZE' in implemented_methods:
print('LOADING TAGGER')
tagger = turbo_interface.create_tagger()
tagger.load_tagger_model(b_tagger_model)
if 'LEMMATIZE' in implemented_methods or 'TAG' in implemented_methods or 'PARSE' in implemented_methods:
print('LOADING LEMMATIZER')
from lemmatizer import lemmatize
if 'PARSE' in implemented_methods:
print('LOADING PARSER')
parser = turbo_interface.create_parser()
parser.load_parser_model(b_parser_model)
def split_sentences(text):
return sentence_tokenizer.tokenize(text)
def tokenize(sentence):
return word_tokenizer.tokenize(sentence)
def tag_sentence(sentence):
# Write tokens to input file
f_input = open(tmp_input_path, 'w')
for token in tokenize(sentence):
f_input.write(token + '\t_\n')
f_input.close()
# Tag tokens
tagger.tag(b_tmp_input_path, b_tmp_output_path)
# Iterate through tagged tokens
f_output = open(tmp_output_path)
for line in f_output:
line = line.rstrip('\n')
if line == '':
continue
token, tag = line.split('\t')
yield (token, tag)
f_output.close()
def tag_lemmatize_sentence(sentence):
# Write tokens to input file
f_input = open(tmp_input_path, 'w')
for token in tokenize(sentence):
f_input.write(token + '\t_\n')
f_input.close()
# Tag tokens
tagger.tag(b_tmp_input_path, b_tmp_output_path)
# Iterate through tagged tokens
f_output = open(tmp_output_path)
for line in f_output:
line = line.rstrip('\n')
if line == '':
continue
token, tag = line.split('\t')
lemma = lemmatize(token, tag)
yield (token, tag, lemma)
f_output.close()
def parse_sentence(sentence):
# Write tokens to input file
f_input = open(tmp_input_path, 'w')
# Iterate through tagged tokens, prepare input
i = 0
for token, tag, lemma in tag_lemmatize_sentence(sentence):
i += 1
f_input.write(
# position
str(i) + '\t' +
# token
token + '\t' +
# lemma
lemma + '\t' +
# tag (twice)
tag + '\t' +
tag + '\t' +
# filler
'_\t_\t_\n'
)
f_input.close()
# Parse sentence
parser.parse(b_tmp_input_path, b_tmp_output_path)
# Iterate through parsed stuff
f_output = open(tmp_output_path)
for line in f_output:
line = line.rstrip('\n')
if line == '':
continue
fields = line.split('\t')
#
token = fields[1]
lemma = fields[2]
tag = fields[3]
head = str(int(fields[6]) - 1)
deprel = fields[7]
yield (token, tag, head, deprel)
#!python3
import pipeline
import socketserver
from settings import server_type_server, server_host, server_port, server_timeout
from settings import b_implemented_methods
actions = {
b'TAG': pipeline.tag_sentence,
b'LEMMATIZE': pipeline.tag_lemmatize_sentence,
b'PARSE': pipeline.parse_sentence,
}
class NLPServer(socketserver.StreamRequestHandler):
def handle(self):
# What kind of request are we handling?
firstline = self.rfile.readline()
parameters = firstline.split()
if len(parameters) != 2:
self.wfile.write(b'\n\n')
return
action, language = parameters
if action not in b_implemented_methods:
self.wfile.write(b'\n\n')
return
# Get the text data
text = ''
while True:
line = self.rfile.readline().decode()
if not line.strip():
break
text += line
text += '\n'
# Execute the action
method = actions.get(action, None)
if method is None:
for sentence in pipeline.split_sentences(text):
for token in pipeline.tokenize(sentence):
self.wfile.write(
token.encode() + b'\n'
)
self.wfile.write(b'\n')
self.wfile.write(b'\n')
else:
for sentence in pipeline.split_sentences(text):
for row in method(sentence):
self.wfile.write(
(
'\t'.join(row)
).encode() + b'\n'
)
self.wfile.write(b'\n')
self.wfile.write(b'\n')
def handle_timeout(self):
self.request.sendall(b'\n\n')
if __name__ == '__main__':
print('STARTING TCP SERVER')
server = server_type_server((server_host, server_port), NLPServer)
server.timeout = server_timeout
try:
server.serve_forever()
except (KeyboardInterrupt, SystemExit):
print('STOPPING TCP SERVER')
server.shutdown()
import os
import socket
import socketserver
# Server parameters
server_host = 'localhost'
server_port = 1234
server_type_server = socketserver.TCPServer
server_type_client = socket.AF_INET, socket.SOCK_STREAM
server_timeout = 2.0
server_buffer = 4096
# Implemented methods (others are treated as 'tokenize')
implemented_methods = {'TOKENIZE', 'TAG', 'LEMMATIZE'}
# server_methods = {'TOKENIZE', 'TAG', 'LEMMATIZE', 'PARSE'}
b_implemented_methods = {name.encode() for name in implemented_methods}
# Models
data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
tokenizer_model = os.path.join(data_dir, 'english.pickle')
tagger_model = os.path.join(data_dir, 'english_proj_tagger.model')
# parser_model = 'data/210basic_sd330'
parser_model = os.path.join(data_dir, 'english_proj_parser_pruned-true_model-full.model')
b_tagger_model = tagger_model.encode()
b_parser_model = parser_model.encode()
# Temporary files access
tmp_input_path = '/tmp/nlpserver_input.tmp'
tmp_output_path = '/tmp/nlpserver_output.tmp'
b_tmp_input_path = tmp_input_path.encode()
b_tmp_output_path = tmp_output_path.encode()
......@@ -296,8 +296,8 @@ gargantext.controller("DatasetController", function($scope, $http) {
return defaults.concat(transform);
}
return $http.get(url, {
cache: true,
transformResponse: appendTransform($http.defaults.transformResponse, function(value) {
console.log(value.data)
return value.data;
})
});
......
var operators = {
'string': [
{'label': 'starts with', 'key': 'startswith'},
{'label': 'contains', 'key': 'contains'},
{'label': 'ends with', 'key': 'endswith'},
{'label': 'is', 'key': '='},
{'label': 'is before', 'key': '<'},
{'label': 'is after', 'key': '>'}
],
'integer': [
{'label': 'is', 'key': '='},
{'label': 'is lower than', 'key': '<'},
{'label': 'is higher than', 'key': '>'}
],
'float': [
{'label': 'is', 'key': '='},
{'label': 'is lower than', 'key': '<'},
{'label': 'is higher than', 'key': '>'}
],
'datetime': [
{'label': 'is', 'key': '='},
{'label': 'is before', 'key': '<'},
{'label': 'is after', 'key': '>'}
],
};
var MetadataList = Backbone.Model.extend({
urlRoot: '/api/nodes/(:/nodeId)/children/metadata',
defaults: function() {
return {
key: '',
type: 'string',
values: null,
};
}
});
var Filter = Backbone.Model.extend({
defaults: function() {
return {
entity: null,
key: null,
transformation: null,
operator: null,
value: null
};
}
});
var FilterList = Backbone.Collection.extend({
model: Filter
});
var FilterView = Backbone.View.extend({
tagName: 'li',
className: 'filter',
template: _.template($('#filter-template').html()),
events: {
'change select[name=entity]': 'changeEntity',
'click button.remove': 'clear'
},
initialize: function(){
// this.model.bind('reset', this.render);
// this.model.bind('change', this.render);
// this.model.bind('destroy', this.clear);
// this.render();
},
changeEntity: function(){
alert('CHANGE')
},
render: function() {
// this.$el.html(this.template(this.model.toJSON()));
this.$el.html(this.template({}));
return this;
},
clear: function() {
// alert('CLEAR');
// this.model.invoke('destroy');
this.model.destroy();
}
});
var FilterListView = Backbone.View.extend({
tagName: 'div',
className: 'filters',
template: _.template($('#filterlist-template').html()),
events: {
'click button.add': 'addOne'
},
initialize: function(parameters) {
this.filterList = new FilterList();
this.metadataList = new MetadataList({nodeId: parameters.nodeId});
console.log(this.metadataList.fetch({nodeId: parameters.nodeId}))
this.listenTo(this.filterList, 'add', this.addOne);
this.listenTo(this.filterList, 'reset', this.addAll);
this.listenTo(this.filterList, 'all', this.render);
},
render: function() {
// render template
var rendered = this.$el.html(this.template({}));
// return the object
return this;
},
addOne: function(filter){
var view = new FilterView({model: filter});
this.$('ul.filters').append(
view.render(filter).$el
);
}
});
var operators = {
'string': [
{'label': 'starts with', 'key': 'startswith'},
{'label': 'contains', 'key': 'contains'},
{'label': 'ends with', 'key': 'endswith'},
{'label': 'is', 'key': '='},
{'label': 'is before', 'key': '<'},
{'label': 'is after', 'key': '>'}
],
'integer': [
{'label': 'is', 'key': '='},
{'label': 'is lower than', 'key': '<'},
{'label': 'is higher than', 'key': '>'}
],
'float': [
{'label': 'is', 'key': '='},
{'label': 'is lower than', 'key': '<'},
{'label': 'is higher than', 'key': '>'}
],
'datetime': [
{'label': 'is', 'key': '='},
{'label': 'is before', 'key': '<'},
{'label': 'is after', 'key': '>'}
],
};
// MODELS
var Metadata = can.Model({
findAll: 'GET /api/nodes/{parent}/children/metadata'
});
var QueriedNodeList = can.Model({
findAll: 'POST /api/nodes/{parent}/children/queries'
});
var Filter = can.Model({
findAll: function(){
return $.Deferred().resolve([]);
},
findOne: function(){
return $.Deferred().resolve(undefined);
},
update: function(){
return $.Deferred().resolve();
},
destroy: function(){
return $.Deferred().resolve();
}
}, {});
// CONTROLLERS
var FilterController = can.Control.extend({
'init': function(element, options){
this.element = element;
Filter.findAll({}, function(filter){
element.append(
$(can.view('FilterView', {filter: filter}))
);
});
this.element.find('li select[name=entity]').each(function() {
$(this).change();
});
},
'li select[name=entity] change': 'changeEntity',
'changeEntity': function(element, event){
var entityName = this.element.find('select[name=entity]').val();
element.closest('li')
.find('span.entity').hide()
.filter('.' + entityName).show();
// alert(value);
}
});
var FilterListController = can.Control.extend({
'init': function(element, options){
this.element = element.html( can.view('FilterListView', options) );
// Filter.findAll({}, function(filters){
// var el = this.element;
// el.html( can.view('filterView', {filters: filters}) )
// });
// metadata = Metadata.findAll(parameters);
},
'button.create click': function(){
var filterController = new FilterController(
this.element.find('ul.filters')
);
},
'filter': function(filter){
this.options.filter = filter;
this.on();
},
'{filter} destroyed': function(){
this.element.hide();
}
});
// var Query = can.Model({
// 'init' : function(parameters) {
// }
// });
......@@ -15,9 +15,24 @@
<div class="container theme-showcase" role="main">
<div class="jumbotron">
<div class="row">
<div class="col-md-4 content">
<h1>Gargantext</h1>
<p>A web platform to explore text-mining</p>
<a class="btn btn-primary btn-lg" href="/projects">Test Gargantext</a>
</div>
<div class="col-md-3 content">
</div>
<div class="col-md-5 content">
<!--
<h3>Project Manager:</h3>
<h4><a href="http://alexandre.delanoe.org" target="blank">Alexandre Delanoë</a></h4>
<h3>Scientific board:</h3>
<h4><a href="http://chavalarias.com" target="blank">David Chavalarias</a> and <a href="http://alexandre.delanoe.org" target="blank">Alexandre Delanoë</a></h4>
<h3><a href="/about/#collapseTeam" target="blank">Thanks to all the team</a></h3>
-->
</div>
</div>
</div>
<div class="container">
......
......@@ -427,7 +427,8 @@
//CSS events for changing the Select element
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="pubmed" || selected=="istext") {
//if(selected=="pubmed" || selected=="istext") {
if(selected=="pubmed") {
console.log("show the button for: "+selected)
$("#pubmedcrawl").css("visibility", "visible");
$("#pubmedcrawl").show();
......
......@@ -249,7 +249,7 @@
</span>
<span ng-if="filter.entity.key == 'ngrams'">
are in this list:
<tags-input ng-model="filter.value" display-property="terms" placeholder="Add an ngram" on-tag-added="updateQuery()" on-tag-removed="updateQuery()" add-from-autocomplete-only="true">
<tags-input ng-model="filter.value" display-property="terms" placeholder="Add an ngram" on-tag-added="updateQuery()" on-tag-removed="updateQuery()" add-from-autocomplete-only="true" replace-spaces-with-dashes="false">
<auto-complete source="getNgrams($query)"></auto-complete>
</tags-input ng-model="tags">
</span>
......