Commit 06a5ba47 authored by PkSM3

[MERGE] maybe this works?

parents f9a08e53 b57ae7fe
@@ -82,7 +82,7 @@ class NodesChildrenNgrams(APIView):
     def get(self, request, node_id):
         # query ngrams
         ParentNode = aliased(Node)
-        ngrams_query = (Ngram
+        ngrams_query = (session
             .query(Ngram.terms, func.count().label('count'))
             # .query(Ngram.id, Ngram.terms, func.count().label('count'))
             .join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
@@ -128,7 +128,7 @@ class NodesChildrenDuplicates(APIView):
             raise APIException('Missing GET parameter: "keys"', 400)
         keys = request.GET['keys'].split(',')
         # metadata retrieval
-        metadata_query = (Metadata
+        metadata_query = (session
             .query(Metadata)
             .filter(Metadata.name.in_(keys))
         )
@@ -213,7 +213,7 @@ class NodesChildrenMetatadata(APIView):
         # query metadata keys
         ParentNode = aliased(Node)
-        metadata_query = (Metadata
+        metadata_query = (session
             .query(Metadata)
             .join(Node_Metadata, Node_Metadata.metadata_id == Metadata.id)
             .join(Node, Node.id == Node_Metadata.node_id)
@@ -233,7 +233,7 @@ class NodesChildrenMetatadata(APIView):
             values_to = None
             if metadata.type != 'text':
                 value_column = getattr(Node_Metadata, 'value_' + metadata.type)
-                node_metadata_query = (Node_Metadata
+                node_metadata_query = (session
                     .query(value_column)
                     .join(Node, Node.id == Node_Metadata.node_id)
                     .filter(Node.parent_id == node_id)
@@ -381,9 +381,9 @@ class NodesChildrenQueries(APIView):
         for field_name in fields_names:
             split_field_name = field_name.split('.')
             if split_field_name[0] == 'metadata':
-                metadata = Metadata.query(Metadata).filter(Metadata.name == split_field_name[1]).first()
+                metadata = session.query(Metadata).filter(Metadata.name == split_field_name[1]).first()
                 if metadata is None:
-                    metadata_query = Metadata.query(Metadata.name).order_by(Metadata.name)
+                    metadata_query = session.query(Metadata.name).order_by(Metadata.name)
                     metadata_names = [metadata.name for metadata in metadata_query.all()]
                     raise APIException('Invalid key for "%s" in parameter "field", should be one of the following values: "%s". "%s" was found instead' % (field[0], '", "'.join(metadata_names), field[1]), 400)
                 # check or create Node_Metadata alias; join if necessary
@@ -422,7 +422,7 @@ class NodesChildrenQueries(APIView):
         )
         # starting the query!
-        document_type_id = NodeType.query(NodeType.id).filter(NodeType.name == 'Document').scalar()
+        document_type_id = session.query(NodeType.id).filter(NodeType.name == 'Document').scalar()
         query = (session
             .query(*fields_list)
             .select_from(Node)
@@ -451,9 +451,9 @@ class NodesChildrenQueries(APIView):
                 #
                 if field[0] == 'metadata':
                     # which metadata?
-                    metadata = Metadata.query(Metadata).filter(Metadata.name == field[1]).first()
+                    metadata = session.query(Metadata).filter(Metadata.name == field[1]).first()
                     if metadata is None:
-                        metadata_query = Metadata.query(Metadata.name).order_by(Metadata.name)
+                        metadata_query = session.query(Metadata.name).order_by(Metadata.name)
                         metadata_names = [metadata.name for metadata in metadata_query.all()]
                         raise APIException('Invalid key for "%s" in parameter "field", should be one of the following values: "%s". "%s" was found instead' % (field[0], '", "'.join(metadata_names), field[1]), 400)
                     # check or create Node_Metadata alias; join if necessary
@@ -475,7 +475,7 @@ class NodesChildrenQueries(APIView):
                     ))
                 elif field[0] == 'ngrams':
                     query = query.filter(
-                        Node.id.in_(Node_Metadata
+                        Node.id.in_(session
                             .query(Node_Ngram.node_id)
                             .filter(Node_Ngram.ngram_id == Ngram.id)
                             .filter(operator(
@@ -551,7 +551,7 @@ class NodesChildrenQueries(APIView):
 class NodesList(APIView):
     def get(self, request):
-        query = (Node
+        query = (session
             .query(Node.id, Node.name, NodeType.name.label('type'))
             .filter(Node.user_id == request.session._session_cache['_auth_user_id'])
             .join(NodeType)
@@ -626,7 +626,7 @@ class CorpusController:
         # build query
         ParentNode = aliased(Node)
-        query = (Ngram
+        query = (session
            .query(Ngram.terms, func.count('*'))
            .join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
            .join(Node, Node.id == Node_Ngram.node_id)
......
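Reviewer note: every hunk above makes the same substitution: queries are no longer built from the mapped model classes (`Ngram.query(...)`, `Metadata.query(...)`) but from an explicit SQLAlchemy session. A minimal sketch of the new pattern, assuming a reachable PostgreSQL database (the URL is an illustrative placeholder; only the model names come from this codebase):

# minimal sketch, assuming a reachable PostgreSQL database;
# the URL below is an illustrative placeholder
from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker

engine = create_engine('postgresql+psycopg2://user:password@localhost/gargantext')
Session = sessionmaker(bind=engine)
session = Session()

# queries now start from the session, not from a model class, e.g.:
# ngrams_query = (session
#     .query(Ngram.terms, func.count().label('count'))
#     .join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
# )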
-from node import models
 from gargantext_web import settings
+from node import models

-__all__ = ['literalquery', 'session', 'cache']
+__all__ = ['literalquery', 'session', 'cache', 'Session', 'bulk_insert', 'engine', 'get_cursor']
+
+# initialize sqlalchemy
+from sqlalchemy.ext.automap import automap_base
+from sqlalchemy.orm import Session
+from sqlalchemy import create_engine
+
+engine = create_engine('postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}/{NAME}'.format(
+    **settings.DATABASES['default']
+))
+Base = automap_base()
+Base.prepare(engine, reflect=True)
+
+# model representation
+def model_repr(modelname):
+    def _repr(obj):
+        result = '<' + modelname
+        isfirst = True
+        for key, value in obj.__dict__.items():
+            if key[0] != '_':
+                value = repr(value)
+                if len(value) > 64:
+                    value = value[:30] + '....' + value[-30:]
+                if isfirst:
+                    isfirst = False
+                else:
+                    result += ','
+                result += ' ' + key + '=' + value
+        result += '>'
+        return result
+    return _repr
+
 # map the Django models found in node.models to SQLAlchemy models
 for model_name, model in models.__dict__.items():
-    if hasattr(model, 'sa'):
-        globals()[model_name] = model.sa
-        __all__.append(model_name)
+    if hasattr(model, '_meta'):
+        table_name = model._meta.db_table
+        if hasattr(Base.classes, table_name):
+            sqla_model = getattr(Base.classes, table_name)
+            setattr(sqla_model, '__repr__', model_repr(model_name))
+            globals()[model_name] = sqla_model
+            __all__.append(model_name)

 NodeNgram = Node_Ngram
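Reviewer note: instead of relying on a per-model `.sa` attribute, the module now reflects the Django-managed tables into SQLAlchemy classes with automap. A standalone sketch of the technique, assuming a reflectable database (URL and table name are illustrative):

from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

# assumed placeholder URL; any reflectable database works
engine = create_engine('postgresql+psycopg2://user:password@localhost/mydb')
Base = automap_base()
Base.prepare(engine, reflect=True)   # one mapped class per reflected table

# classes are keyed by table name: a Django model stored in a table
# named 'node_node' (hypothetical) would appear as Base.classes.node_node
session = Session(engine)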
@@ -61,16 +97,17 @@ def literalquery(statement, dialect=None):
 # SQLAlchemy session management

-def get_sessionmaker():
-    from django.db import connections
-    from sqlalchemy.orm import sessionmaker
+def get_engine():
     from sqlalchemy import create_engine
-    alias = 'default'
-    connection = connections[alias]
     url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}/{NAME}'.format(
         **settings.DATABASES['default']
     )
-    engine = create_engine(url, use_native_hstore=True)
+    return create_engine(url, use_native_hstore=True)
+
+engine = get_engine()
+
+def get_sessionmaker():
+    from sqlalchemy.orm import sessionmaker
     return sessionmaker(bind=engine)

 Session = get_sessionmaker()
@@ -84,7 +121,7 @@ from sqlalchemy import or_
 class ModelCache(dict):

     def __init__(self, model, preload=False):
-        self._model = model.sa
+        self._model = globals()[model.__name__]
         self._columns_names = [column.name for column in model._meta.fields if column.unique]
         self._columns = [getattr(self._model, column_name) for column_name in self._columns_names]
         self._columns_validators = []
@@ -92,20 +129,17 @@ class ModelCache(dict):
             self.preload()

     def __missing__(self, key):
-        for column in self._columns:
-            conditions = []
-            try:
-                formatted_key = column.type.python_type(key)
-                conditions.append(column == key)
-            except ValueError:
-                pass
-        if formatted_key in self:
-            self[key] = self[formatted_key]
-        else:
-            element = session.query(self._model).filter(or_(*conditions)).first()
-            if element is None:
-                raise KeyError
-            self[key] = element
+        conditions = [
+            (column == key)
+            for column in self._columns
+            if key.__class__ == column.type.python_type
+        ]
+        if len(conditions) == 0:
+            raise KeyError
+        element = session.query(self._model).filter(or_(*conditions)).first()
+        if element is None:
+            raise KeyError
+        self[key] = element
         return element

     def preload(self):
@@ -127,3 +161,48 @@ class Cache:
         return modelcache

 cache = Cache()
+
+# Insert many elements at once
+import psycopg2
+
+def get_cursor():
+    db_settings = settings.DATABASES['default']
+    db = psycopg2.connect(**{
+        'database': db_settings['NAME'],
+        'user':     db_settings['USER'],
+        'password': db_settings['PASSWORD'],
+        'host':     db_settings['HOST'],
+    })
+    return db, db.cursor()
+
+class bulk_insert:
+
+    def __init__(self, table, keys, data, cursor=None):
+        # prepare the iterator
+        self.iter = iter(data)
+        # template
+        self.template = '%s' + (len(keys) - 1) * '\t%s' + '\n'
+        # prepare the cursor
+        if cursor is None:
+            db, cursor = get_cursor()
+            mustcommit = True
+        else:
+            mustcommit = False
+        # insert data
+        if not isinstance(table, str):
+            table = table.__table__.name
+        cursor.copy_from(self, table, columns=keys)
+        # commit if necessary
+        if mustcommit:
+            db.commit()
+
+    def read(self, size=None):
+        try:
+            return self.template % next(self.iter)
+        except StopIteration:
+            return ''
+
+    readline = read
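Reviewer note: `bulk_insert` works because psycopg2's `cursor.copy_from()` accepts any file-like object exposing `read()`/`readline()`; each tuple from the iterator is rendered through the tab-separated template and streamed into PostgreSQL's `COPY FROM`, so values must not contain raw tabs or newlines. A hypothetical usage sketch (column names and values are illustrative):

# (node_id, ngram_id, weight) triples, streamed without building a list
rows = ((1, ngram_id, 1.0) for ngram_id in (42, 43, 44))
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], rows)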
@@ -3,7 +3,7 @@ from django.conf.urls import patterns, include, url
 from django.contrib import admin
 from django.contrib.auth.views import login

-from gargantext_web import views
+from gargantext_web import views, views_optimized
 import gargantext_web.api
 import scrap_pubmed.views as pubmedscrapper
@@ -31,7 +31,7 @@ urlpatterns = patterns('',
     # Project Management
     url(r'^projects/$', views.projects),
     url(r'^project/(\d+)/delete/$', views.delete_project),
-    url(r'^project/(\d+)/$', views.project),
+    url(r'^project/(\d+)/$', views_optimized.project),

     # Corpus management
     url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
......
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect, HttpResponseForbidden

from sqlalchemy import func
from sqlalchemy.orm import aliased

from collections import defaultdict
from datetime import datetime

from node.admin import CustomForm
from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT

from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf


def project(request, project_id):
    # SQLAlchemy session
    session = Session()
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.type_id == cache.NodeType['Project'].id)
    ).first()
    if project is None:
        raise Http404()
    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/login/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()
    # let's find out about the children nodes of the project
    ChildrenNode = aliased(Node)
    corpus_query = (session
        .query(Node, Resource, func.count(ChildrenNode.id))
        .outerjoin(ChildrenNode, ChildrenNode.parent_id == Node.id)
        .outerjoin(Node_Resource, Node_Resource.node_id == Node.id)
        .outerjoin(Resource, Resource.id == Node_Resource.resource_id)
        .filter(Node.parent_id == project.id)
        .group_by(Node, Resource)
        .order_by(Node.name)
    )
    corpora_by_resourcetype = defaultdict(list)
    documents_count_by_resourcetype = defaultdict(int)
    corpora_count = 0
    for corpus, resource, document_count in corpus_query:
        if resource is None:
            resourcetype_name = '(no resource)'
        else:
            resourcetype = cache.ResourceType[resource.type_id]
            resourcetype_name = resourcetype.name
        corpora_by_resourcetype[resourcetype_name].append({
            'id': corpus.id,
            'name': corpus.name,
            'count': document_count,
        })
        documents_count_by_resourcetype[resourcetype_name] += document_count
        corpora_count += 1
    # do the donut
    total_documents_count = sum(documents_count_by_resourcetype.values())
    donut = [
        {   'source': key,
            'count': value,
            'part': round(value * 100 / total_documents_count) if total_documents_count else 0,
        }
        for key, value in documents_count_by_resourcetype.items()
    ]
    # deal with the form
    if request.method == 'POST':
        # form validation
        form = CustomForm(request.POST, request.FILES)
        if form.is_valid():
            # extract information from the form
            name = form.cleaned_data['name']
            thefile = form.cleaned_data['file']
            resourcetype = cache.ResourceType[form.cleaned_data['type']]
            # which default language shall be used?
            if resourcetype.name == "europress_french":
                language_id = cache.Language['fr'].id
            elif resourcetype.name == "europress_english":
                language_id = cache.Language['en'].id
            else:
                language_id = None
            # corpus node instantiation as a Django model
            corpus = Node(
                name = name,
                user_id = request.user.id,
                parent_id = project_id,
                type_id = cache.NodeType['Corpus'].id,
                language_id = language_id,
            )
            session.add(corpus)
            session.commit()
            # save the uploaded file
            filepath = '%s/corpora/%s/%s' % (MEDIA_ROOT, request.user.username, thefile._name)
            f = open(filepath, 'wb')
            f.write(thefile.read())
            f.close()
            # add the uploaded resource to the corpus
            add_resource(corpus,
                user_id = request.user.id,
                type_id = resourcetype.id,
                file = filepath,
            )
            # let's start the workflow
            try:
                parse_resources(corpus)
                extract_ngrams(corpus, ['title'])
                compute_tfidf(corpus)
            except Exception as error:
                print('WORKFLOW ERROR')
                print(error)
            # redirect to the main project page
            return HttpResponseRedirect('/project/' + str(project_id))
        else:
            print('ERROR: BAD FORM')
    else:
        form = CustomForm()
    # HTML output
    return render(request, 'project.html', {
        'form'         : form,
        'user'         : user,
        'date'         : datetime.now(),
        'project'      : project,
        'donut'        : donut,
        'list_corpora' : dict(corpora_by_resourcetype),
        'whitelists'   : '',
        'blacklists'   : '',
        'cooclists'    : '',
        'number'       : corpora_count,
    })
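A quick worked example of the donut computation above, with made-up counts:

documents_count_by_resourcetype = {'pubmed': 3, 'europress_french': 1}
total_documents_count = sum(documents_count_by_resourcetype.values())  # 4
donut = [
    {'source': key, 'count': value,
     'part': round(value * 100 / total_documents_count) if total_documents_count else 0}
    for key, value in documents_count_by_resourcetype.items()
]
# -> 'pubmed' gets part=75, 'europress_french' gets part=25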
@@ -70,7 +70,7 @@ class Resource(models.Model):
         return self.file

 class NodeType(models.Model):
-    name = models.CharField(max_length=200, unique=True)
+    name = models.CharField(max_length=255, unique=True)

     def __str__(self):
         return self.name
@@ -111,7 +111,7 @@ class NodeManager(CTENodeManager):
         return getattr(self.get_queryset(), name, *args)

 class Metadata(models.Model):
-    name = models.CharField(max_length=32, db_index=True)
+    name = models.CharField(max_length=32, unique=True)
     type = models.CharField(max_length=16, db_index=True)

 class Node(CTENode):
@@ -120,7 +120,7 @@ class Node(CTENode):
     user = models.ForeignKey(User)
     type = models.ForeignKey(NodeType)
-    name = models.CharField(max_length=200)
+    name = models.CharField(max_length=255)
     language = models.ForeignKey(Language, blank=True, null=True, on_delete=models.SET_NULL)
@@ -189,7 +189,7 @@ class Node(CTENode):
         for i, metadata_values in enumerate(metadata_list):
             if verbose:
                 print(i, end='\r', flush=True)
-            name = metadata_values.get('title', '')[:200]
+            name = metadata_values.get('title', '')[:255]
             language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
             if isinstance(language, tuple):
                 language = language[0]
......
@@ -48,11 +48,9 @@ class EuropressFileParser(FileParser):
             print(error)
-        except:
-            return []
-
-        # initialize the list of metadata
-        metadata_list = []
+        except Exception as error:
+            print(error)

         # parse all the articles, one by one
         try:
             for html_article in html_articles:
@@ -201,16 +199,9 @@ class EuropressFileParser(FileParser):
                     #metadata_str = {}
                     for key, value in metadata.items():
                         metadata[key] = value.decode() if isinstance(value, bytes) else value
-                    metadata_list.append(metadata)
+                    yield metadata
                     count += 1

         except Exception as error:
             print(error)
             pass
-
-        # from pprint import pprint
-        # pprint(metadata_list)
-
-        # return []
-        return metadata_list
@@ -103,15 +103,21 @@ class FileParser:
             zipArchive = zipfile.ZipFile(file)
             for filename in zipArchive.namelist():
                 try:
-                    metadata_list += self.parse(zipArchive.open(filename, "r"))
+                    f = zipArchive.open(filename, 'r')
+                    metadata_list += self.parse(f)
+                    f.close()
                 except Exception as error:
                     print(error)
         # ...otherwise, let's parse it directly!
         else:
             try:
-                metadata_list += self._parse(file)
+                for metadata in self._parse(file):
+                    metadata_list.append(self.format_metadata(metadata))
+                if hasattr(file, 'close'):
+                    file.close()
             except Exception as error:
                 print(error)
         # return the list of formatted metadata
-        return map(self.format_metadata, metadata_list)
+        return metadata_list
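Reviewer note: across the parsers in this commit, `_parse()` turns from a list-builder into a generator, and `parse()` now formats each yielded item itself. A sketch of the new contract, using a hypothetical minimal subclass:

class LinesFileParser(FileParser):  # hypothetical, for illustration only
    def _parse(self, file):
        # yield one metadata dict per logical document
        for line in file:
            yield {'title': line.decode().strip()}

Because `parse()` wraps each yielded dict with `self.format_metadata(...)`, subclasses no longer need to accumulate or format anything themselves.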
@@ -2,22 +2,14 @@ from django.db import transaction
 from lxml import etree

 from .FileParser import FileParser
 from ..NgramsExtractors import *
-from datetime import datetime
-from io import BytesIO

 class PubmedFileParser(FileParser):

     def _parse(self, file):
         # open the file as XML
         xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
-        xml = ""
-        if type(file)==bytes: xml = etree.parse( BytesIO(file) , parser=xml_parser)
-        else: xml = etree.parse(file, parser=xml_parser)
+        xml = etree.parse(file, parser=xml_parser)
         xml_articles = xml.findall('PubmedArticle')
-
-        # initialize the list of metadata
-        metadata_list = []

         # parse all the articles, one by one
         for xml_article in xml_articles:
             # extract data from the document
@@ -25,14 +17,9 @@ class PubmedFileParser(FileParser):
             metadata_path = {
                 "journal"           : 'MedlineCitation/Article/Journal/Title',
                 "title"             : 'MedlineCitation/Article/ArticleTitle',
-                # "abstract"        : 'MedlineCitation/Article/Abstract/AbstractText',
-                "title2"            : 'MedlineCitation/Article/VernacularTitle',
                 "language_iso3"     : 'MedlineCitation/Article/Language',
                 "doi"               : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
-                "realdate_full_"    : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
-                "realdate_year_"    : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year',
-                "realdate_month_"   : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Month',
-                "realdate_day_"     : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Day',
+                "abstract"          : 'MedlineCitation/Article/Abstract/AbstractText',
                 "publication_year"  : 'MedlineCitation/DateCreated/Year',
                 "publication_month" : 'MedlineCitation/DateCreated/Month',
                 "publication_day"   : 'MedlineCitation/DateCreated/Day',
@@ -41,7 +28,6 @@ class PubmedFileParser(FileParser):
             for key, path in metadata_path.items():
                 try:
                     xml_node = xml_article.find(path)
-                    # Authors tag
                     if key == 'authors':
                         metadata[key] = ', '.join([
                             xml_author.find('ForeName').text + ' ' + xml_author.find('LastName').text
@@ -49,61 +35,6 @@ class PubmedFileParser(FileParser):
                         ])
                     else:
                         metadata[key] = xml_node.text
                 except:
                     pass
-
-            #Title-Decision
-            Title=""
-            if not metadata["title"] or metadata["title"]=="":
-                if "title2" in metadata:
-                    metadata["title"] = metadata["title2"]
-                else: metadata["title"] = ""
-
-            # Date-Decision
-            # forge.iscpif.fr/issues/1418
-            RealDate = ""
-            if "realdate_full_" in metadata:
-                RealDate = metadata["realdate_full_"]
-            else:
-                if "realdate_year_" in metadata: RealDate+=metadata["realdate_year_"]
-                if "realdate_month_" in metadata: RealDate+=" "+metadata["realdate_month_"]
-                if "realdate_day_" in metadata: RealDate+=" "+metadata["realdate_day_"]
-            metadata["realdate_full_"] = RealDate
-            RealDate = RealDate.split("-")[0]
-
-            PubmedDate = ""
-            if "publication_year" in metadata: PubmedDate+=metadata["publication_year"]
-            if "publication_month" in metadata: PubmedDate+=" "+metadata["publication_month"]
-            if "publication_day" in metadata: PubmedDate+=" "+metadata["publication_day"]
-
-            Decision=""
-            if len(RealDate)>4:
-                if len(RealDate)>8:
-                    try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
-                    except:
-                        try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
-                        except: Decision=False
-                else:
-                    try: Decision = datetime.strptime(RealDate, '%Y %b').date()
-                    except:
-                        try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
-                        except: Decision=False
-            else:
-                try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
-                except: Decision=False
-
-            if Decision!=False:
-                if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
-                if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
-                if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
-                if "realdate_year_" in metadata: metadata.pop("realdate_year_")
-                if "realdate_month_" in metadata: metadata.pop("realdate_month_")
-                if "realdate_day_" in metadata: metadata.pop("realdate_day_")
-                if "title2" in metadata: metadata.pop("title2")
-
-            # print(metadata)
-            # print("* * * * ** * * * * ")
-            metadata_list.append(metadata)
-
-        # return the list of metadata
-        return metadata_list
+            yield metadata
\ No newline at end of file
@@ -17,42 +17,34 @@ class RisFileParser(FileParser):
     }

     def _parse(self, file):
-        metadata_list = []
         metadata = {}
         last_key = None
         last_values = []
-        # browse every line of the file
         for line in file:
             if len(line) > 2:
-                # extract the parameter key
                 parameter_key = line[:2]
-                # print(parameter_key)
                 if parameter_key != b'  ' and parameter_key != last_key:
                     if last_key in self._parameters:
-                        # translate the parameter key
                         parameter = self._parameters[last_key]
                         if parameter["type"] == "metadata":
                             separator = parameter["separator"] if "separator" in parameter else ""
                             metadata[parameter["key"]] = separator.join(last_values)
                         elif parameter["type"] == "delimiter":
-                            #language = self._languages_fullname[metadata["language"].lower()]
-                            #print(metadata)
-                            try:
-                                #print("append")
-                                if 'language_fullname' not in metadata.keys():
-                                    if 'language_iso3' not in metadata.keys():
-                                        if 'language_iso2' not in metadata.keys():
-                                            metadata['language_iso2'] = 'en'
-                                metadata_list.append(metadata)
-                                metadata = {}
-                                #print("append succeeded")
-                            except:
-                                pass
+                            if 'language_fullname' not in metadata.keys():
+                                if 'language_iso3' not in metadata.keys():
+                                    if 'language_iso2' not in metadata.keys():
+                                        metadata['language_iso2'] = 'en'
+                            yield metadata
+                            metadata = {}
                     last_key = parameter_key
                     last_values = []
                 try:
                     last_values.append(line[3:-1].decode())
                 except Exception as error:
                     print(error)
-                    pass
-        #print(len(metadata_list))
-        #print(metadata_list)
-        return metadata_list
+        # if a metadata object is left in memory, yield it as well
+        if metadata:
+            yield metadata
@@ -13,6 +13,7 @@ class NgramsExtractor:
         self.start()
         self._label = "NP"
         self._rule = self._label + ": " + rule
+        self._grammar = nltk.RegexpParser(self._rule)

     def __del__(self):
         self.stop()
@@ -29,19 +30,8 @@ class NgramsExtractor:
     """
     def extract_ngrams(self, contents):
         tagged_ngrams = self.tagger.tag_text(contents)
-        if len(tagged_ngrams)==0: return []
-
-        grammar = nltk.RegexpParser(self._rule)
-        result = []
-        # try:
-        grammar_parsed = grammar.parse(tagged_ngrams)
-        for subtree in grammar_parsed.subtrees():
-            if subtree.label() == self._label:
-                result.append(subtree.leaves())
-        # except Exception as e:
-        #     print("Problem while parsing rule '%s'" % (self._rule, ))
-        #     print(e)
-        return result
+        if len(tagged_ngrams):
+            grammar_parsed = self._grammar.parse(tagged_ngrams)
+            for subtree in grammar_parsed.subtrees():
+                if subtree.label() == self._label:
+                    yield subtree.leaves()
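Reviewer note: compiling the grammar once in `__init__` avoids rebuilding the `RegexpParser` on every call, and `extract_ngrams()` now yields matches lazily instead of materialising a list. A hypothetical consumption sketch, assuming the tagger emits `(token, tag)` pairs so each yielded leaf list can be joined back into a term:

extractor = EnglishNgramsExtractor()  # assumed concrete subclass
for leaves in extractor.extract_ngrams('the queen bee dances'):
    term = ' '.join(token for token, tag in leaves)
    print(term)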
 from .FrenchNgramsExtractor import FrenchNgramsExtractor
 from .TurboNgramsExtractor import TurboNgramsExtractor as EnglishNgramsExtractor
-# from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
+# from .EnglishNgramsExtractor import EnglishNgramsExtractor
 from .NgramsExtractor import NgramsExtractor
@@ -71,4 +71,3 @@ class Tagger:
             tokens_tags += self.tag_tokens(tokens, False)
         self.tagging_end()
         return tokens_tags
-
@@ -9,15 +9,24 @@ from .settings import implemented_methods
 class NLPClient:

     def __init__(self):
-        self._socket = socket.socket(*server_type_client)
-        self._socket.connect((server_host, server_port))
+        self._socket = None
         for method_name in dir(self):
             if method_name[0] != '_':
                 if method_name.upper() not in implemented_methods:
                     setattr(self, method_name, self._notimplemented)

     def __del__(self):
-        self._socket.close()
+        self._disconnect()
+
+    def _connect(self):
+        self._disconnect()
+        self._socket = socket.socket(*server_type_client)
+        self._socket.connect((server_host, server_port))
+
+    def _disconnect(self):
+        if self._socket is not None:
+            self._socket.close()
+            self._socket = None

     def _notimplemented(self, *args, **kwargs):
         raise NotImplementedError(
@@ -51,7 +60,7 @@ class NLPClient:
         data += language + '\n'
         data += re.sub(r'\n+', '\n', text)
         data += '\n\n'
-        self.__init__()
+        self._connect()
         self._socket.sendall(data.encode())
         sentence = []
         if keys is None:
@@ -73,7 +82,6 @@ class NLPClient:
                 continue
             values = line.split('\t')
             sentence.append(dict(zip(keys, line.split('\t'))))
-        self.__del__()

     def tokenize(self, text, language='english', asdict=False):
         keys = ('token', ) if asdict else None
......
@@ -4,7 +4,7 @@ import socketserver
 # Server parameters
 server_host = 'localhost'
-server_port = 1234
+server_port = 7777
 server_type_server = socketserver.TCPServer
 server_type_client = socket.AF_INET, socket.SOCK_STREAM
 server_timeout = 2.0
......
# Without this, we couldn't use the Django environment
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")

# database tools
from gargantext_web.db import *
from parsing.corpustools import *

user = session.query(User).first()
project = session.query(Node).filter(Node.name == 'A').first()

corpus = Node(
    parent_id = project.id,
    name = 'Test 456',
    type_id = cache.NodeType['Corpus'].id,
    user_id = user.id,
)
session.add(corpus)
session.commit()

add_resource(corpus,
    # file = './data_samples/pubmed_result.xml',
    file = './data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
    type_id = cache.ResourceType['pubmed'].id,
)

parse_resources(corpus)
extract_ngrams(corpus, ('title', ))

# print(corpus)
# corpus = session.query(Node).filter(Node.id == 72771).first()
# corpus = session.query(Node).filter(Node.id == 73017).first()
compute_tfidf(corpus)