Commit 06a5ba47 authored by PkSM3

[MERGE] maybe this works?

parents f9a08e53 b57ae7fe
@@ -82,7 +82,7 @@ class NodesChildrenNgrams(APIView):
def get(self, request, node_id):
# query ngrams
ParentNode = aliased(Node)
-ngrams_query = (Ngram
ngrams_query = (session
.query(Ngram.terms, func.count().label('count'))
# .query(Ngram.id, Ngram.terms, func.count().label('count'))
.join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
@@ -128,7 +128,7 @@ class NodesChildrenDuplicates(APIView):
raise APIException('Missing GET parameter: "keys"', 400)
keys = request.GET['keys'].split(',')
# metadata retrieval
-metadata_query = (Metadata
metadata_query = (session
.query(Metadata)
.filter(Metadata.name.in_(keys))
)
@@ -213,7 +213,7 @@ class NodesChildrenMetatadata(APIView):
# query metadata keys
ParentNode = aliased(Node)
-metadata_query = (Metadata
metadata_query = (session
.query(Metadata)
.join(Node_Metadata, Node_Metadata.metadata_id == Metadata.id)
.join(Node, Node.id == Node_Metadata.node_id)
@@ -233,7 +233,7 @@ class NodesChildrenMetatadata(APIView):
values_to = None
if metadata.type != 'text':
value_column = getattr(Node_Metadata, 'value_' + metadata.type)
-node_metadata_query = (Node_Metadata
node_metadata_query = (session
.query(value_column)
.join(Node, Node.id == Node_Metadata.node_id)
.filter(Node.parent_id == node_id)
@@ -381,9 +381,9 @@ class NodesChildrenQueries(APIView):
for field_name in fields_names:
split_field_name = field_name.split('.')
if split_field_name[0] == 'metadata':
-metadata = Metadata.query(Metadata).filter(Metadata.name == split_field_name[1]).first()
metadata = session.query(Metadata).filter(Metadata.name == split_field_name[1]).first()
if metadata is None:
-metadata_query = Metadata.query(Metadata.name).order_by(Metadata.name)
metadata_query = session.query(Metadata.name).order_by(Metadata.name)
metadata_names = [metadata.name for metadata in metadata_query.all()]
raise APIException('Invalid key for "%s" in parameter "field", should be one of the following values: "%s". "%s" was found instead' % (field[0], '", "'.join(metadata_names), field[1]), 400)
# check or create Node_Metadata alias; join if necessary
@@ -422,7 +422,7 @@ class NodesChildrenQueries(APIView):
)
# starting the query!
-document_type_id = NodeType.query(NodeType.id).filter(NodeType.name == 'Document').scalar()
document_type_id = session.query(NodeType.id).filter(NodeType.name == 'Document').scalar()
query = (session
.query(*fields_list)
.select_from(Node)
@@ -451,9 +451,9 @@ class NodesChildrenQueries(APIView):
#
if field[0] == 'metadata':
# which metadata?
-metadata = Metadata.query(Metadata).filter(Metadata.name == field[1]).first()
metadata = session.query(Metadata).filter(Metadata.name == field[1]).first()
if metadata is None:
-metadata_query = Metadata.query(Metadata.name).order_by(Metadata.name)
metadata_query = session.query(Metadata.name).order_by(Metadata.name)
metadata_names = [metadata.name for metadata in metadata_query.all()]
raise APIException('Invalid key for "%s" in parameter "field", should be one of the following values: "%s". "%s" was found instead' % (field[0], '", "'.join(metadata_names), field[1]), 400)
# check or create Node_Metadata alias; join if necessary
@@ -475,7 +475,7 @@ class NodesChildrenQueries(APIView):
))
elif field[0] == 'ngrams':
query = query.filter(
-Node.id.in_(Node_Metadata
Node.id.in_(session
.query(Node_Ngram.node_id)
.filter(Node_Ngram.ngram_id == Ngram.id)
.filter(operator(
@@ -551,7 +551,7 @@ class NodesChildrenQueries(APIView):
class NodesList(APIView):
def get(self, request):
-query = (Node
query = (session
.query(Node.id, Node.name, NodeType.name.label('type'))
.filter(Node.user_id == request.session._session_cache['_auth_user_id'])
.join(NodeType)
@@ -626,7 +626,7 @@ class CorpusController:
# build query
ParentNode = aliased(Node)
-query = (Ngram
query = (session
.query(Ngram.terms, func.count('*'))
.join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
.join(Node, Node.id == Node_Ngram.node_id)
...
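The recurring change in this file replaces queries started from a model class (Ngram.query(...), Metadata.query(...), NodeType.query(...)) with queries started from the shared SQLAlchemy session. A minimal sketch of the pattern, as an editor's illustration rather than part of the commit, assuming the session and automapped Ngram exported by gargantext_web.db (see the next file below):

from sqlalchemy import func
from gargantext_web.db import session, Ngram

# before: ngrams_query = Ngram.query(Ngram.terms, func.count().label('count'))
# after: the query is built from the shared session
ngrams_query = session.query(Ngram.terms, func.count().label('count'))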
-from node import models
from gargantext_web import settings
from node import models
-__all__ = ['literalquery', 'session', 'cache']
__all__ = ['literalquery', 'session', 'cache', 'Session', 'bulk_insert', 'engine', 'get_cursor']
# initialize sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}/{NAME}'.format(
**settings.DATABASES['default']
))
Base = automap_base()
Base.prepare(engine, reflect=True)
# model representation
def model_repr(modelname):
def _repr(obj):
result = '<' + modelname
isfirst = True
for key, value in obj.__dict__.items():
if key[0] != '_':
value = repr(value)
if len(value) > 64:
value = value[:30] + '....' + value[-30:]
if isfirst:
isfirst = False
else:
result += ','
result += ' ' + key + '=' + value
result += '>'
return result
return _repr
# map the Django models found in node.models to SQLAlchemy models
for model_name, model in models.__dict__.items():
-if hasattr(model, 'sa'):
-globals()[model_name] = model.sa
-__all__.append(model_name)
if hasattr(model, '_meta'):
table_name = model._meta.db_table
if hasattr(Base.classes, table_name):
sqla_model = getattr(Base.classes, table_name)
setattr(sqla_model, '__repr__', model_repr(model_name))
globals()[model_name] = sqla_model
__all__.append(model_name)
NodeNgram = Node_Ngram
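For orientation, a minimal usage sketch of what the automap block above provides; this is an editor's illustration, not part of the commit, and it assumes the mapping picked up node.models.Node and node.models.NodeType and that the module-level session listed in __all__ is available:

from gargantext_web.db import session, Node, NodeType

# Node and NodeType are now SQLAlchemy automap classes and can be queried directly
document_type = session.query(NodeType).filter(NodeType.name == 'Document').first()
first_document = session.query(Node).filter(Node.type_id == document_type.id).first()
print(first_document)  # rendered by the model_repr() __repr__ attached above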
@@ -61,16 +97,17 @@ def literalquery(statement, dialect=None):
# SQLAlchemy session management
-def get_sessionmaker():
-from django.db import connections
-from sqlalchemy.orm import sessionmaker
def get_engine():
from sqlalchemy import create_engine
-alias = 'default'
-connection = connections[alias]
url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}/{NAME}'.format(
**settings.DATABASES['default']
)
-engine = create_engine(url, use_native_hstore=True)
return create_engine(url, use_native_hstore=True)
engine = get_engine()
def get_sessionmaker():
from sqlalchemy.orm import sessionmaker
return sessionmaker(bind=engine)
Session = get_sessionmaker()
@@ -84,7 +121,7 @@ from sqlalchemy import or_
class ModelCache(dict):
def __init__(self, model, preload=False):
-self._model = model.sa
self._model = globals()[model.__name__]
self._columns_names = [column.name for column in model._meta.fields if column.unique]
self._columns = [getattr(self._model, column_name) for column_name in self._columns_names]
self._columns_validators = []
@@ -92,20 +129,17 @@ class ModelCache(dict):
self.preload()
def __missing__(self, key):
-for column in self._columns:
-conditions = []
-try:
-formatted_key = column.type.python_type(key)
-conditions.append(column == key)
-except ValueError:
-pass
-if formatted_key in self:
-self[key] = self[formatted_key]
-else:
-element = session.query(self._model).filter(or_(*conditions)).first()
-if element is None:
-raise KeyError
-self[key] = element
conditions = [
(column == key)
for column in self._columns
if key.__class__ == column.type.python_type
]
if len(conditions) == 0:
raise KeyError
element = session.query(self._model).filter(or_(*conditions)).first()
if element is None:
raise KeyError
self[key] = element
return element
def preload(self):
@@ -127,3 +161,48 @@ class Cache:
return modelcache
cache = Cache()
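A short sketch of how the cache is meant to be used; editor's illustration, mirroring the cache.NodeType['Document'] and cache.Language['fr'] lookups that appear in parsing/corpustools.py further down:

from gargantext_web.db import cache

# Cache builds one ModelCache per model on first access; ModelCache.__missing__
# matches the key against any unique column of the underlying table.
document_type_id = cache.NodeType['Document'].id
french_id = cache.Language['fr'].id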
# Insert many elements at once
import psycopg2
def get_cursor():
db_settings = settings.DATABASES['default']
db = psycopg2.connect(**{
'database': db_settings['NAME'],
'user': db_settings['USER'],
'password': db_settings['PASSWORD'],
'host': db_settings['HOST'],
})
return db, db.cursor()
class bulk_insert:
def __init__(self, table, keys, data, cursor=None):
# prepare the iterator
self.iter = iter(data)
# template
self.template = '%s' + (len(keys) - 1) * '\t%s' + '\n'
# prepare the cursor
if cursor is None:
db, cursor = get_cursor()
mustcommit = True
else:
mustcommit = False
# insert data
if not isinstance(table, str):
table = table.__table__.name
cursor.copy_from(self, table, columns=keys)
# commit if necessary
if mustcommit:
db.commit()
def read(self, size=None):
try:
return self.template % next(self.iter)
except StopIteration:
return ''
readline = read
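The get_cursor/bulk_insert pair streams rows into PostgreSQL through cursor.copy_from. A minimal usage sketch; editor's illustration only, with example data (the same call shape is used with tmp__ngrams in parsing/corpustools.py below):

from gargantext_web.db import get_cursor, bulk_insert

# bulk_insert feeds itself to copy_from(), emitting one tab-separated line per tuple
db, cursor = get_cursor()
bulk_insert('tmp__ngrams', ['n', 'terms'], [(2, 'honey bee'), (1, 'hive')], cursor=cursor)
db.commit()  # when a cursor is passed in, committing is left to the caller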
@@ -3,7 +3,7 @@ from django.conf.urls import patterns, include, url
from django.contrib import admin
from django.contrib.auth.views import login
-from gargantext_web import views
from gargantext_web import views, views_optimized
import gargantext_web.api
import scrap_pubmed.views as pubmedscrapper
@@ -31,7 +31,7 @@ urlpatterns = patterns('',
# Project Management
url(r'^projects/$', views.projects),
url(r'^project/(\d+)/delete/$', views.delete_project),
-url(r'^project/(\d+)/$', views.project),
url(r'^project/(\d+)/$', views_optimized.project),
# Corpus management
url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
...
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect, HttpResponseForbidden
from sqlalchemy import func
from sqlalchemy.orm import aliased
from collections import defaultdict
from datetime import datetime
from node.admin import CustomForm
from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
def project(request, project_id):
# SQLAlchemy session
session = Session()
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.type_id == cache.NodeType['Project'].id)
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
# Let's find out about the children nodes of the project
ChildrenNode = aliased(Node)
corpus_query = (session
.query(Node, Resource, func.count(ChildrenNode.id))
.outerjoin(ChildrenNode, ChildrenNode.parent_id == Node.id)
.outerjoin(Node_Resource, Node_Resource.node_id == Node.id)
.outerjoin(Resource, Resource.id == Node_Resource.resource_id)
.filter(Node.parent_id == project.id)
.group_by(Node, Resource)
.order_by(Node.name)
)
corpora_by_resourcetype = defaultdict(list)
documents_count_by_resourcetype = defaultdict(int)
corpora_count = 0
for corpus, resource, document_count in corpus_query:
if resource is None:
resourcetype_name = '(no resource)'
else:
resourcetype = cache.ResourceType[resource.type_id]
resourcetype_name = resourcetype.name
corpora_by_resourcetype[resourcetype_name].append({
'id': corpus.id,
'name': corpus.name,
'count': document_count,
})
documents_count_by_resourcetype[resourcetype_name] += document_count
corpora_count += 1
# do the donut
total_documents_count = sum(documents_count_by_resourcetype.values())
donut = [
{ 'source': key,
'count': value,
'part' : round(value * 100 / total_documents_count) if total_documents_count else 0,
}
for key, value in documents_count_by_resourcetype.items()
]
# deal with the form
if request.method == 'POST':
# form validation
form = CustomForm(request.POST, request.FILES)
if form.is_valid():
# extract information from the form
name = form.cleaned_data['name']
thefile = form.cleaned_data['file']
resourcetype = cache.ResourceType[form.cleaned_data['type']]
# which default language shall be used?
if resourcetype.name == "europress_french":
language_id = cache.Language['fr'].id
elif resourcetype.name == "europress_english":
language_id = cache.Language['en'].id
else:
language_id = None
# corpus node instantiation (SQLAlchemy model)
corpus = Node(
name = name,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = language_id,
)
session.add(corpus)
session.commit()
# save the uploaded file
filepath = '%s/corpora/%s/%s' % (MEDIA_ROOT, request.user.username, thefile._name)
f = open(filepath, 'wb')
f.write(thefile.read())
f.close()
# add the uploaded resource to the corpus
add_resource(corpus,
user_id = request.user.id,
type_id = resourcetype.id,
file = filepath,
)
# let's start the workflow
try:
parse_resources(corpus)
extract_ngrams(corpus, ['title'])
compute_tfidf(corpus)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
# redirect to the main project page
return HttpResponseRedirect('/project/' + str(project_id))
else:
print('ERROR: BAD FORM')
else:
form = CustomForm()
# HTML output
return render(request, 'project.html', {
'form' : form,
'user' : user,
'date' : datetime.now(),
'project' : project,
'donut' : donut,
'list_corpora' : dict(corpora_by_resourcetype),
'whitelists' : '',
'blacklists' : '',
'cooclists' : '',
'number' : corpora_count,
})
@@ -70,7 +70,7 @@ class Resource(models.Model):
return self.file
class NodeType(models.Model):
-name = models.CharField(max_length=200, unique=True)
name = models.CharField(max_length=255, unique=True)
def __str__(self):
return self.name
@@ -111,7 +111,7 @@ class NodeManager(CTENodeManager):
return getattr(self.get_queryset(), name, *args)
class Metadata(models.Model):
-name = models.CharField(max_length=32, db_index=True)
name = models.CharField(max_length=32, unique=True)
type = models.CharField(max_length=16, db_index=True)
class Node(CTENode):
@@ -120,7 +120,7 @@ class Node(CTENode):
user = models.ForeignKey(User)
type = models.ForeignKey(NodeType)
-name = models.CharField(max_length=200)
name = models.CharField(max_length=255)
language = models.ForeignKey(Language, blank=True, null=True, on_delete=models.SET_NULL)
@@ -189,7 +189,7 @@ class Node(CTENode):
for i, metadata_values in enumerate(metadata_list):
if verbose:
print(i, end='\r', flush=True)
-name = metadata_values.get('title', '')[:200]
name = metadata_values.get('title', '')[:255]
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
if isinstance(language, tuple):
language = language[0]
...
@@ -48,11 +48,9 @@ class EuropressFileParser(FileParser):
print(error)
-except:
-return []
except Exception as error:
print(error)
-# initialize the list of metadata
-metadata_list = []
# parse all the articles, one by one
try:
for html_article in html_articles:
@@ -201,16 +199,9 @@ class EuropressFileParser(FileParser):
#metadata_str = {}
for key, value in metadata.items():
metadata[key] = value.decode() if isinstance(value, bytes) else value
-metadata_list.append(metadata)
yield metadata
count += 1
except Exception as error:
print(error)
pass
-# from pprint import pprint
-# pprint(metadata_list)
-# return []
-return metadata_list
@@ -103,15 +103,21 @@ class FileParser:
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
try:
-metadata_list += self.parse(zipArchive.open(filename, "r"))
f = zipArchive.open(filename, 'r')
metadata_list += self.parse(f)
f.close()
except Exception as error:
print(error)
# ...otherwise, let's parse it directly!
else:
try:
-metadata_list += self._parse(file)
for metadata in self._parse(file):
metadata_list.append(self.format_metadata(metadata))
if hasattr(file, 'close'):
file.close()
except Exception as error:
print(error)
# return the list of formatted metadata
-return map(self.format_metadata, metadata_list)
return metadata_list
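Across the parsers, _parse() is being turned into a generator that yields one metadata dict per document, and parse() now drains it and formats each entry. A short sketch of the new contract; editor's illustration, assuming the parsing.FileParsers package exports the parser and using an example file path:

from parsing.FileParsers import PubmedFileParser

parser = PubmedFileParser()
for metadata in parser.parse('pubmed_sample.xml'):
    print(metadata.get('title'))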
@@ -2,22 +2,14 @@ from django.db import transaction
from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
-from datetime import datetime
-from io import BytesIO
class PubmedFileParser(FileParser):
def _parse(self, file):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
-xml = ""
-if type(file)==bytes: xml = etree.parse( BytesIO(file) , parser=xml_parser)
-else: xml = etree.parse(file, parser=xml_parser)
xml = etree.parse(file, parser=xml_parser)
xml_articles = xml.findall('PubmedArticle')
-# initialize the list of metadata
-metadata_list = []
# parse all the articles, one by one
for xml_article in xml_articles:
# extract data from the document
@@ -25,14 +17,9 @@ class PubmedFileParser(FileParser):
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
-# "abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
-"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
-"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
-"realdate_year_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year',
-"realdate_month_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Month',
-"realdate_day_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Day',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
@@ -41,7 +28,6 @@ class PubmedFileParser(FileParser):
for key, path in metadata_path.items():
try:
xml_node = xml_article.find(path)
-# Authors tag
if key == 'authors':
metadata[key] = ', '.join([
xml_author.find('ForeName').text + ' ' + xml_author.find('LastName').text
@@ -49,61 +35,6 @@ class PubmedFileParser(FileParser):
])
else:
metadata[key] = xml_node.text
except:
pass
yield metadata
\ No newline at end of file
-#Title-Decision
Title=""
if not metadata["title"] or metadata["title"]=="":
if "title2" in metadata:
metadata["title"] = metadata["title2"]
else: metadata["title"] = ""
# Date-Decision
# forge.iscpif.fr/issues/1418
RealDate = ""
if "realdate_full_" in metadata:
RealDate = metadata["realdate_full_"]
else:
if "realdate_year_" in metadata: RealDate+=metadata["realdate_year_"]
if "realdate_month_" in metadata: RealDate+=" "+metadata["realdate_month_"]
if "realdate_day_" in metadata: RealDate+=" "+metadata["realdate_day_"]
metadata["realdate_full_"] = RealDate
RealDate = RealDate.split("-")[0]
PubmedDate = ""
if "publication_year" in metadata: PubmedDate+=metadata["publication_year"]
if "publication_month" in metadata: PubmedDate+=" "+metadata["publication_month"]
if "publication_day" in metadata: PubmedDate+=" "+metadata["publication_day"]
Decision=""
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
except:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(RealDate, '%Y %b').date()
except:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
if Decision!=False:
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
if "realdate_year_" in metadata: metadata.pop("realdate_year_")
if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "realdate_day_" in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
# print("* * * * ** * * * * ")
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
@@ -17,42 +17,34 @@ class RisFileParser(FileParser):
}
def _parse(self, file):
-metadata_list = []
metadata = {}
last_key = None
last_values = []
-# browse every line of the file
for line in file:
if len(line) > 2:
-# extract the parameter key
parameter_key = line[:2]
-# print(parameter_key)
if parameter_key != b' ' and parameter_key != last_key:
if last_key in self._parameters:
-# translate the parameter key
parameter = self._parameters[last_key]
if parameter["type"] == "metadata":
separator = parameter["separator"] if "separator" in parameter else ""
metadata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter":
-#language = self._languages_fullname[metadata["language"].lower()]
-#print(metadata)
-try:
-#print("append")
-if 'language_fullname' not in metadata.keys():
-if 'language_iso3' not in metadata.keys():
-if 'language_iso2' not in metadata.keys():
-metadata['language_iso2'] = 'en'
-metadata_list.append(metadata)
-metadata = {}
-#print("append succeeded")
-except:
-pass
if 'language_fullname' not in metadata.keys():
if 'language_iso3' not in metadata.keys():
if 'language_iso2' not in metadata.keys():
metadata['language_iso2'] = 'en'
yield metadata
metadata = {}
last_key = parameter_key
last_values = []
try:
last_values.append(line[3:-1].decode())
except Exception as error:
print(error)
-pass
-#print(len(metadata_list))
-#print(metadata_list)
-return metadata_list
# if a metadata object is left in memory, yield it as well
if metadata:
yield metadata
@@ -13,6 +13,7 @@ class NgramsExtractor:
self.start()
self._label = "NP"
self._rule = self._label + ": " + rule
self._grammar = nltk.RegexpParser(self._rule)
def __del__(self):
self.stop()
@@ -29,19 +30,8 @@ class NgramsExtractor:
"""
def extract_ngrams(self, contents):
tagged_ngrams = self.tagger.tag_text(contents)
-if len(tagged_ngrams)==0: return []
-grammar = nltk.RegexpParser(self._rule)
-result = []
-# try:
-grammar_parsed = grammar.parse(tagged_ngrams)
-for subtree in grammar_parsed.subtrees():
-if subtree.label() == self._label:
-result.append(subtree.leaves())
-# except Exception as e:
-# print("Problem while parsing rule '%s'" % (self._rule, ))
-# print(e)
-return result
if len(tagged_ngrams):
grammar_parsed = self._grammar.parse(tagged_ngrams)
for subtree in grammar_parsed.subtrees():
if subtree.label() == self._label:
yield subtree.leaves()
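extract_ngrams is now a generator yielding the leaves of each matching NP subtree instead of returning a list. A brief consumption sketch; editor's illustration, mirroring the loop in parsing/corpustools.py below and assuming the ngramsextractors cache defined there:

from parsing.corpustools import ngramsextractors

extractor = ngramsextractors['en']
for leaves in extractor.extract_ngrams('Honey bees pollinate flowering plants.'):
    terms = ' '.join(token for token, tag in leaves).lower()
    print(terms)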
from .FrenchNgramsExtractor import FrenchNgramsExtractor
from .TurboNgramsExtractor import TurboNgramsExtractor as EnglishNgramsExtractor
-# from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
# from .EnglishNgramsExtractor import EnglishNgramsExtractor
from .NgramsExtractor import NgramsExtractor
@@ -71,4 +71,3 @@ class Tagger:
tokens_tags += self.tag_tokens(tokens, False)
self.tagging_end()
return tokens_tags
@@ -9,15 +9,24 @@ from .settings import implemented_methods
class NLPClient:
def __init__(self):
-self._socket = socket.socket(*server_type_client)
-self._socket.connect((server_host, server_port))
self._socket = None
for method_name in dir(self):
if method_name[0] != '_':
if method_name.upper() not in implemented_methods:
setattr(self, method_name, self._notimplemented)
def __del__(self):
-self._socket.close()
self._disconnect()
def _connect(self):
self._disconnect()
self._socket = socket.socket(*server_type_client)
self._socket.connect((server_host, server_port))
def _disconnect(self):
if self._socket is not None:
self._socket.close()
self._socket = None
def _notimplemented(self, *args, **kwargs):
raise NotImplementedError(
@@ -51,7 +60,7 @@ class NLPClient:
data += language + '\n'
data += re.sub(r'\n+', '\n', text)
data += '\n\n'
-self.__init__()
self._connect()
self._socket.sendall(data.encode())
sentence = []
if keys is None:
@@ -73,7 +82,6 @@ class NLPClient:
continue
values = line.split('\t')
sentence.append(dict(zip(keys, line.split('\t'))))
-self.__del__()
def tokenize(self, text, language='english', asdict=False):
keys = ('token', ) if asdict else None
...
@@ -4,7 +4,7 @@ import socketserver
# Server parameters
server_host = 'localhost'
-server_port = 1234
server_port = 7777
server_type_server = socketserver.TCPServer
server_type_client = socket.AF_INET, socket.SOCK_STREAM
server_timeout = 2.0
...
from collections import defaultdict
from datetime import datetime
from random import random
from hashlib import md5
from time import time
from math import log
from gargantext_web.db import *
from .FileParsers import *
class DebugTime:
def __init__(self, prefix):
self.prefix = prefix
self.message = None
self.time = None
def __del__(self):
if self.message is not None and self.time is not None:
print('%s - %s: %.4f' % (self.prefix, self.message, time() - self.time))
def show(self, message):
self.__del__()
self.message = message
self.time = time()
# keep all the parsers in a cache
class Parsers(defaultdict):
_parsers = {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
}
def __missing__(self, key):
if key not in self._parsers:
raise NotImplementedError('No such parser: "%s"' % (key))
parser = self._parsers[key]()
self[key] = parser
return parser
parsers = Parsers()
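Parsers instantiates each file parser lazily on first lookup and keeps it for reuse. A tiny sketch, as an editor's illustration in the context of this module:

parser = parsers['pubmed']          # built on first access via __missing__
assert parser is parsers['pubmed']  # later lookups reuse the cached instance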
# resources management
def add_resource(corpus, **kwargs):
# only for tests
session = Session()
resource = Resource(guid=str(random()), **kwargs )
# User
if 'user_id' not in kwargs:
resource.user_id = corpus.user_id
# Compute the digest
h = md5()
f = open(str(resource.file), 'rb')
h.update(f.read())
f.close()
resource.digest = h.hexdigest()
# check if a resource on this node already has this hash
tmp_resource = (session
.query(Resource)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.filter(Resource.digest == resource.digest)
.filter(Node_Resource.node_id == corpus.id)
).first()
if tmp_resource is not None:
return tmp_resource
else:
session.add(resource)
session.commit()
# link with the resource
node_resource = Node_Resource(
node_id = corpus.id,
resource_id = resource.id,
parsed = False,
)
session.add(node_resource)
session.commit()
# return result
return resource
def parse_resources(corpus, user=None, user_id=None):
dbg = DebugTime('Corpus #%d - parsing' % corpus.id)
session = Session()
corpus_id = corpus.id
type_id = cache.NodeType['Document'].id
if user_id is None and user is not None:
user_id = user.id
else:
user_id = corpus.user_id
# find resource of the corpus
resources_query = (session
.query(Resource, ResourceType)
.join(ResourceType, ResourceType.id == Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.filter(Node_Resource.node_id == corpus.id)
.filter(Node_Resource.parsed == False)
)
# make a new node for every parsed document of the corpus
dbg.show('analyze documents')
nodes = list()
for resource, resourcetype in resources_query:
parser = parsers[resourcetype.name]
for metadata_dict in parser.parse(resource.file):
# retrieve language ID from metadata
if 'language_iso2' in metadata_dict:
try:
language_id = cache.Language[metadata_dict['language_iso2']].id
except KeyError:
language_id = None
else:
language_id = None
# create new node
node = Node(
name = metadata_dict.get('title', '')[:255],
parent_id = corpus_id,
user_id = user_id,
type_id = type_id,
language_id = language_id,
metadata = metadata_dict,
date = datetime.utcnow(),
)
nodes.append(node)
#
# TODO: mark node-resources associations as parsed
#
dbg.show('insert %d documents' % len(nodes))
session.add_all(nodes)
session.commit()
# now, index the metadata
dbg.show('insert metadata')
node_metadata_lists = defaultdict(list)
metadata_types = {
metadata.name: metadata
for metadata in session.query(Metadata)
}
for node in nodes:
node_id = node.id
for metadata_key, metadata_value in node.metadata.items():
try:
metadata = metadata_types[metadata_key]
except KeyError:
continue
if metadata.type == 'string':
metadata_value = metadata_value[:255]
node_metadata_lists[metadata.type].append((
node_id,
metadata.id,
metadata_value,
))
for key, values in node_metadata_lists.items():
bulk_insert(Node_Metadata, ['node_id', 'metadata_id', 'value_'+key], values)
# mark the corpus as parsed
corpus.parsed = True
# ngrams extraction
from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
class NgramsExtractors(defaultdict):
def __init__(self):
# English
self['en'] = EnglishNgramsExtractor()
for key in ('eng', 'english'):
self[key] = self['en']
# French
self['fr'] = FrenchNgramsExtractor()
for key in ('fre', 'french'):
self[key] = self['fr']
# default
self['default'] = NgramsExtractor()
def __missing__(self, key):
formatted_key = key.strip().lower()
if formatted_key in self:
self[key] = self[formatted_key]
else:
self[key] = self['default']
# raise NotImplementedError
return self[key]
ngramsextractors = NgramsExtractors()
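NgramsExtractors resolves a language key to an extractor, normalizing the key and falling back to the default extractor when the language is unknown. A small sketch, editor's illustration only:

extractor = ngramsextractors['EN ']      # stripped and lowercased to 'en', then cached under the raw key
fallback = ngramsextractors['klingon']   # unknown languages fall back to the default NgramsExtractor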
def extract_ngrams(corpus, keys):
dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
# query the metadata associated with the given keys
columns = [Node.id, Node.language_id] + [Node.metadata[key] for key in keys]
metadata_query = (session
.query(*columns)
.filter(Node.parent_id == corpus.id)
.filter(Node.type_id == cache.NodeType['Document'].id)
)
# prepare data to be inserted
dbg.show('find ngrams')
languages_by_id = {
language.id: language.iso2
for language in session.query(Language)
}
ngrams_data = set()
node_ngram_list = defaultdict(lambda: defaultdict(int))
for nodeinfo in metadata_query:
node_id = nodeinfo[0]
language_id = nodeinfo[1]
if language_id is None:
language_iso2 = default_language_iso2
else:
language_iso2 = languages_by_id.get(language_id, None)
if language_iso2 is None:
continue
ngramsextractor = ngramsextractors[language_iso2]
for text in nodeinfo[2:]:
if text is not None and len(text):
ngrams = ngramsextractor.extract_ngrams(text)
for ngram in ngrams:
terms = ' '.join([token for token, tag in ngram]).lower()
n = len(ngram)
node_ngram_list[node_id][terms] += 1
ngrams_data.add(
(n, terms)
)
# insert ngrams to temporary table
dbg.show('find ids for the %d ngrams' % len(ngrams_data))
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__ngrams (
id INT,
n INT NOT NULL,
terms VARCHAR(255) NOT NULL
)
''')
bulk_insert('tmp__ngrams', ['n', 'terms'], ngrams_data, cursor=cursor)
# retrieve ngram ids from already inserted stuff
cursor.execute('''
UPDATE
tmp__ngrams
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngrams.terms
''' % (Ngram.__table__.name, ))
# insert, then get the ids back
cursor.execute('''
INSERT INTO
%s (n, terms)
SELECT
n, terms
FROM
tmp__ngrams
WHERE
id IS NULL
''' % (Ngram.__table__.name, ))
cursor.execute('''
UPDATE
tmp__ngrams
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngrams.terms
AND
tmp__ngrams.id IS NULL
''' % (Ngram.__table__.name, ))
# get all ids
ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngrams')
for row in cursor.fetchall():
ngram_ids[row[1]] = row[0]
#
dbg.show('insert associations')
node_ngram_data = list()
for node_id, ngrams in node_ngram_list.items():
for terms, weight in ngrams.items():
ngram_id = ngram_ids[terms]
node_ngram_data.append((node_id, ngram_id, weight, ))
bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'], node_ngram_data, cursor=cursor)
dbg.message = 'insert %d associations' % len(node_ngram_data)
# commit to database
db.commit()
# tfidf calculation
def compute_tfidf(corpus):
dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
# compute terms frequency sum
dbg.show('calculate terms frequencies sums')
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__st (
node_id INT NOT NULL,
frequency DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__st (node_id, frequency)
SELECT
node_ngram.node_id,
SUM(node_ngram.weight) AS frequency
FROM
%s AS node
INNER JOIN
%s AS node_ngram ON node_ngram.node_id = node.id
WHERE
node.parent_id = %d
GROUP BY
node_ngram.node_id
''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
# compute normalized terms frequencies
dbg.show('normalize terms frequencies')
cursor.execute('''
CREATE TEMPORARY TABLE tmp__tf (
node_id INT NOT NULL,
ngram_id INT NOT NULL,
frequency DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__tf (node_id, ngram_id, frequency)
SELECT
node_ngram.node_id,
node_ngram.ngram_id,
(node_ngram.weight / node.frequency) AS frequency
FROM
%s AS node_ngram
INNER JOIN
tmp__st AS node ON node.node_id = node_ngram.node_id
''' % (Node_Ngram.__table__.name, ))
# show off
dbg.show('compute idf')
cursor.execute('''
CREATE TEMPORARY TABLE tmp__idf (
ngram_id INT NOT NULL,
idf DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__idf(ngram_id, idf)
SELECT
node_ngram.ngram_id,
-ln(COUNT(*))
FROM
%s AS node
INNER JOIN
%s AS node_ngram ON node_ngram.node_id = node.id
WHERE
node.parent_id = %d
GROUP BY
node_ngram.ngram_id
''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
cursor.execute('SELECT COUNT(*) FROM tmp__st')
D = cursor.fetchone()[0]
lnD = log(D)
cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
# show off
dbg.show('insert tfidf for %d documents' % D)
cursor.execute('''
INSERT INTO
%s (nodex_id, nodey_id, ngram_id, score)
SELECT
%d AS nodex_id,
tf.node_id AS nodey_id,
tf.ngram_id AS ngram_id,
(tf.frequency * idf.idf) AS score
FROM
tmp__idf AS idf
INNER JOIN
tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
''' % (NodeNodeNgram.__table__.name, corpus.id, ))
# # show off
# cursor.execute('''
# SELECT
# node.name,
# ngram.terms,
# node_node_ngram.score AS tfidf
# FROM
# %s AS node_node_ngram
# INNER JOIN
# %s AS node ON node.id = node_node_ngram.nodey_id
# INNER JOIN
# %s AS ngram ON ngram.id = node_node_ngram.ngram_id
# WHERE
# node_node_ngram.nodex_id = %d
# ORDER BY
# score DESC
# ''' % (NodeNodeNgram.__table__.name, Node.__table__.name, Ngram.__table__.name, corpus.id, ))
# for row in cursor.fetchall():
# print(row)
# the end!
db.commit()
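For reference, an editor's summary of what the temporary tables above compute; this is the usual tf-idf, with D the number of documents in the corpus (those having at least one ngram) and df(t) the number of documents containing term t:

tf(d, t)    = weight(d, t) / sum over t' of weight(d, t')     -- tmp__st then tmp__tf
idf(t)      = ln(D) - ln(df(t)) = ln(D / df(t))               -- tmp__idf plus the UPDATE
score(d, t) = tf(d, t) * idf(t)                               -- final INSERT into NodeNodeNgram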
# Without this, we couldn't use the Django environment
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")
# database tools
from gargantext_web.db import *
from parsing.corpustools import *
user = session.query(User).first()
project = session.query(Node).filter(Node.name == 'A').first()
corpus = Node(
parent_id = project.id,
name = 'Test 456',
type_id = cache.NodeType['Corpus'].id,
user_id = user.id,
)
session.add(corpus)
session.commit()
add_resource(corpus,
# file = './data_samples/pubmed_result.xml',
file = './data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
type_id = cache.ResourceType['pubmed'].id,
)
parse_resources(corpus)
extract_ngrams(corpus, ('title', ))
# print(corpus)
# corpus = session.query(Node).filter(Node.id == 72771).first()
# corpus = session.query(Node).filter(Node.id == 73017).first()
compute_tfidf(corpus)