Commit 06a5ba47 authored by PkSM3

[MERGE] maybe this works?

parents f9a08e53 b57ae7fe
......@@ -82,7 +82,7 @@ class NodesChildrenNgrams(APIView):
def get(self, request, node_id):
# query ngrams
ParentNode = aliased(Node)
ngrams_query = (Ngram
ngrams_query = (session
.query(Ngram.terms, func.count().label('count'))
# .query(Ngram.id, Ngram.terms, func.count().label('count'))
.join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
......@@ -128,7 +128,7 @@ class NodesChildrenDuplicates(APIView):
raise APIException('Missing GET parameter: "keys"', 400)
keys = request.GET['keys'].split(',')
# metadata retrieval
metadata_query = (Metadata
metadata_query = (session
.query(Metadata)
.filter(Metadata.name.in_(keys))
)
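The change repeated across these hunks replaces the old `Model.query(...)` spelling with queries issued through the shared SQLAlchemy `session` exported by gargantext_web/db.py (shown further down in this diff). An illustrative sketch of the resulting pattern, using names from this file; the `group_by` and the iteration are a plausible completion of the truncated query, not part of the commit:

ngram_counts = (session
    .query(Ngram.terms, func.count().label('count'))
    .join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
    .join(Node, Node.id == Node_Ngram.node_id)
    .filter(Node.parent_id == node_id)
    .group_by(Ngram.terms)
)
for terms, count in ngram_counts:
    ...  # each row is a (terms, count) tuple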
......@@ -213,7 +213,7 @@ class NodesChildrenMetatadata(APIView):
# query metadata keys
ParentNode = aliased(Node)
metadata_query = (Metadata
metadata_query = (session
.query(Metadata)
.join(Node_Metadata, Node_Metadata.metadata_id == Metadata.id)
.join(Node, Node.id == Node_Metadata.node_id)
......@@ -233,7 +233,7 @@ class NodesChildrenMetatadata(APIView):
values_to = None
if metadata.type != 'text':
value_column = getattr(Node_Metadata, 'value_' + metadata.type)
node_metadata_query = (Node_Metadata
node_metadata_query = (session
.query(value_column)
.join(Node, Node.id == Node_Metadata.node_id)
.filter(Node.parent_id == node_id)
......@@ -381,9 +381,9 @@ class NodesChildrenQueries(APIView):
for field_name in fields_names:
split_field_name = field_name.split('.')
if split_field_name[0] == 'metadata':
metadata = Metadata.query(Metadata).filter(Metadata.name == split_field_name[1]).first()
metadata = session.query(Metadata).filter(Metadata.name == split_field_name[1]).first()
if metadata is None:
metadata_query = Metadata.query(Metadata.name).order_by(Metadata.name)
metadata_query = session.query(Metadata.name).order_by(Metadata.name)
metadata_names = [metadata.name for metadata in metadata_query.all()]
raise APIException('Invalid key for "%s" in parameter "field", should be one of the following values: "%s". "%s" was found instead' % (field[0], '", "'.join(metadata_names), field[1]), 400)
# check or create Node_Metadata alias; join if necessary
......@@ -422,7 +422,7 @@ class NodesChildrenQueries(APIView):
)
# starting the query!
document_type_id = NodeType.query(NodeType.id).filter(NodeType.name == 'Document').scalar()
document_type_id = session.query(NodeType.id).filter(NodeType.name == 'Document').scalar()
query = (session
.query(*fields_list)
.select_from(Node)
......@@ -451,9 +451,9 @@ class NodesChildrenQueries(APIView):
#
if field[0] == 'metadata':
# which metadata?
metadata = Metadata.query(Metadata).filter(Metadata.name == field[1]).first()
metadata = session.query(Metadata).filter(Metadata.name == field[1]).first()
if metadata is None:
metadata_query = Metadata.query(Metadata.name).order_by(Metadata.name)
metadata_query = session.query(Metadata.name).order_by(Metadata.name)
metadata_names = [metadata.name for metadata in metadata_query.all()]
raise APIException('Invalid key for "%s" in parameter "field", should be one of the following values: "%s". "%s" was found instead' % (field[0], '", "'.join(metadata_names), field[1]), 400)
# check or create Node_Metadata alias; join if necessary
......@@ -475,7 +475,7 @@ class NodesChildrenQueries(APIView):
))
elif field[0] == 'ngrams':
query = query.filter(
Node.id.in_(Node_Metadata
Node.id.in_(session
.query(Node_Ngram.node_id)
.filter(Node_Ngram.ngram_id == Ngram.id)
.filter(operator(
......@@ -551,7 +551,7 @@ class NodesChildrenQueries(APIView):
class NodesList(APIView):
def get(self, request):
query = (Node
query = (session
.query(Node.id, Node.name, NodeType.name.label('type'))
.filter(Node.user_id == request.session._session_cache['_auth_user_id'])
.join(NodeType)
......@@ -626,7 +626,7 @@ class CorpusController:
# build query
ParentNode = aliased(Node)
query = (Ngram
query = (session
.query(Ngram.terms, func.count('*'))
.join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
.join(Node, Node.id == Node_Ngram.node_id)
......
from node import models
from gargantext_web import settings
from node import models
__all__ = ['literalquery', 'session', 'cache', 'Session', 'bulk_insert', 'engine', 'get_cursor']
__all__ = ['literalquery', 'session', 'cache']
# initialize sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}/{NAME}'.format(
**settings.DATABASES['default']
))
Base = automap_base()
Base.prepare(engine, reflect=True)
# model representation
def model_repr(modelname):
def _repr(obj):
result = '<' + modelname
isfirst = True
for key, value in obj.__dict__.items():
if key[0] != '_':
value = repr(value)
if len(value) > 64:
value = value[:30] + '....' + value[-30:]
if isfirst:
isfirst = False
else:
result += ','
result += ' ' + key + '=' + value
result += '>'
return result
return _repr
# map the Django models found in node.models to SQLAlchemy models
for model_name, model in models.__dict__.items():
if hasattr(model, 'sa'):
globals()[model_name] = model.sa
if hasattr(model, '_meta'):
table_name = model._meta.db_table
if hasattr(Base.classes, table_name):
sqla_model = getattr(Base.classes, table_name)
setattr(sqla_model, '__repr__', model_repr(model_name))
globals()[model_name] = sqla_model
__all__.append(model_name)
NodeNgram = Node_Ngram
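After this loop, every Django model in node.models whose table was reflected by automap is mirrored at module level by a SQLAlchemy class with a readable __repr__, and re-exported through __all__. An illustrative sketch of what that gives importers (exact class, field and table names depend on node.models):

from gargantext_web.db import session, Node, Ngram, NodeNgram

first = session.query(Node).first()
print(first)                 # e.g. <Node id=1, name='...'> thanks to model_repr
print(Ngram.__table__.name)  # the underlying Django table name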
......@@ -61,16 +97,17 @@ def literalquery(statement, dialect=None):
# SQLAlchemy session management
def get_sessionmaker():
from django.db import connections
from sqlalchemy.orm import sessionmaker
def get_engine():
from sqlalchemy import create_engine
alias = 'default'
connection = connections[alias]
url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}/{NAME}'.format(
**settings.DATABASES['default']
)
engine = create_engine(url, use_native_hstore=True)
return create_engine(url, use_native_hstore=True)
engine = get_engine()
def get_sessionmaker():
from sqlalchemy.orm import sessionmaker
return sessionmaker(bind=engine)
Session = get_sessionmaker()
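Callers can either reuse the module-level `session` or build their own from the `Session` factory; views_optimized.project() below does the latter, one session per request. A minimal usage sketch (illustrative, not part of the commit):

from gargantext_web.db import Session, Node

request_session = Session()        # independent session, e.g. one per request
some_node = request_session.query(Node).first()
request_session.close()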
......@@ -84,7 +121,7 @@ from sqlalchemy import or_
class ModelCache(dict):
def __init__(self, model, preload=False):
self._model = model.sa
self._model = globals()[model.__name__]
self._columns_names = [column.name for column in model._meta.fields if column.unique]
self._columns = [getattr(self._model, column_name) for column_name in self._columns_names]
self._columns_validators = []
......@@ -92,16 +129,13 @@ class ModelCache(dict):
self.preload()
def __missing__(self, key):
for column in self._columns:
conditions = []
try:
formatted_key = column.type.python_type(key)
conditions.append(column == key)
except ValueError:
pass
if formatted_key in self:
self[key] = self[formatted_key]
else:
conditions = [
(column == key)
for column in self._columns
if key.__class__ == column.type.python_type
]
if len(conditions) == 0:
raise KeyError
element = session.query(self._model).filter(or_(*conditions)).first()
if element is None:
raise KeyError
......@@ -127,3 +161,48 @@ class Cache:
return modelcache
cache = Cache()
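A short usage sketch of the cache (names taken from elsewhere in this diff): each attribute access returns a ModelCache keyed on any unique column of the model, and misses are resolved against the database.

document_type_id = cache.NodeType['Document'].id
french_id        = cache.Language['fr'].id
pubmed_type_id   = cache.ResourceType['pubmed'].id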
# Insert many elements at once
import psycopg2
def get_cursor():
db_settings = settings.DATABASES['default']
db = psycopg2.connect(**{
'database': db_settings['NAME'],
'user': db_settings['USER'],
'password': db_settings['PASSWORD'],
'host': db_settings['HOST'],
})
return db, db.cursor()
class bulk_insert:
def __init__(self, table, keys, data, cursor=None):
# prepare the iterator
self.iter = iter(data)
# template
self.template = '%s' + (len(keys) - 1) * '\t%s' + '\n'
# prepare the cursor
if cursor is None:
db, cursor = get_cursor()
mustcommit = True
else:
mustcommit = False
# insert data
if not isinstance(table, str):
table = table.__table__.name
cursor.copy_from(self, table, columns=keys)
# commit if necessary
if mustcommit:
db.commit()
def read(self, size=None):
try:
return self.template % next(self.iter)
except StopIteration:
return ''
readline = read
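A hypothetical usage sketch (the ids are made up): bulk_insert feeds itself to psycopg2's cursor.copy_from() through read()/readline(), so each row of `data` must be a tuple matching the column list in `keys`, and the whole batch is written in a single COPY.

bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'],
            [(1, 42, 3.0), (1, 43, 1.0)])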
......@@ -3,7 +3,7 @@ from django.conf.urls import patterns, include, url
from django.contrib import admin
from django.contrib.auth.views import login
from gargantext_web import views
from gargantext_web import views, views_optimized
import gargantext_web.api
import scrap_pubmed.views as pubmedscrapper
......@@ -31,7 +31,7 @@ urlpatterns = patterns('',
# Project Management
url(r'^projects/$', views.projects),
url(r'^project/(\d+)/delete/$', views.delete_project),
url(r'^project/(\d+)/$', views.project),
url(r'^project/(\d+)/$', views_optimized.project),
# Corpus management
url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
......
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect, HttpResponseForbidden
from sqlalchemy import func
from sqlalchemy.orm import aliased
from collections import defaultdict
from datetime import datetime
from node.admin import CustomForm
from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
def project(request, project_id):
# SQLAlchemy session
session = Session()
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.type_id == cache.NodeType['Project'].id)
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
# Let's find out about the children nodes of the project
ChildrenNode = aliased(Node)
corpus_query = (session
.query(Node, Resource, func.count(ChildrenNode.id))
.outerjoin(ChildrenNode, ChildrenNode.parent_id == Node.id)
.outerjoin(Node_Resource, Node_Resource.node_id == Node.id)
.outerjoin(Resource, Resource.id == Node_Resource.resource_id)
.filter(Node.parent_id == project.id)
.group_by(Node, Resource)
.order_by(Node.name)
)
corpora_by_resourcetype = defaultdict(list)
documents_count_by_resourcetype = defaultdict(int)
corpora_count = 0
for corpus, resource, document_count in corpus_query:
if resource is None:
resourcetype_name = '(no resource)'
else:
resourcetype = cache.ResourceType[resource.type_id]
resourcetype_name = resourcetype.name
corpora_by_resourcetype[resourcetype_name].append({
'id': corpus.id,
'name': corpus.name,
'count': document_count,
})
documents_count_by_resourcetype[resourcetype_name] += document_count
corpora_count += 1
# do the donut
total_documents_count = sum(documents_count_by_resourcetype.values())
donut = [
{ 'source': key,
'count': value,
'part' : round(value * 100 / total_documents_count) if total_documents_count else 0,
}
for key, value in documents_count_by_resourcetype.items()
]
# deal with the form
if request.method == 'POST':
# form validation
form = CustomForm(request.POST, request.FILES)
if form.is_valid():
# extract information from the form
name = form.cleaned_data['name']
thefile = form.cleaned_data['file']
resourcetype = cache.ResourceType[form.cleaned_data['type']]
# which default language shall be used?
if resourcetype.name == "europress_french":
language_id = cache.Language['fr'].id
elif resourcetype.name == "europress_english":
language_id = cache.Language['en'].id
else:
language_id = None
# corpus node instantiation as a Django model
corpus = Node(
name = name,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = language_id,
)
session.add(corpus)
session.commit()
# save the uploaded file
filepath = '%s/corpora/%s/%s' % (MEDIA_ROOT, request.user.username, thefile._name)
f = open(filepath, 'wb')
f.write(thefile.read())
f.close()
# add the uploaded resource to the corpus
add_resource(corpus,
user_id = request.user.id,
type_id = resourcetype.id,
file = filepath,
)
# let's start the workflow
try:
parse_resources(corpus)
extract_ngrams(corpus, ['title'])
compute_tfidf(corpus)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
# redirect to the main project page
return HttpResponseRedirect('/project/' + str(project_id))
else:
print('ERROR: BAD FORM')
else:
form = CustomForm()
# HTML output
return render(request, 'project.html', {
'form' : form,
'user' : user,
'date' : datetime.now(),
'project' : project,
'donut' : donut,
'list_corpora' : dict(corpora_by_resourcetype),
'whitelists' : '',
'blacklists' : '',
'cooclists' : '',
'number' : corpora_count,
})
......@@ -70,7 +70,7 @@ class Resource(models.Model):
return self.file
class NodeType(models.Model):
name = models.CharField(max_length=200, unique=True)
name = models.CharField(max_length=255, unique=True)
def __str__(self):
return self.name
......@@ -111,7 +111,7 @@ class NodeManager(CTENodeManager):
return getattr(self.get_queryset(), name, *args)
class Metadata(models.Model):
name = models.CharField(max_length=32, db_index=True)
name = models.CharField(max_length=32, unique=True)
type = models.CharField(max_length=16, db_index=True)
class Node(CTENode):
......@@ -120,7 +120,7 @@ class Node(CTENode):
user = models.ForeignKey(User)
type = models.ForeignKey(NodeType)
name = models.CharField(max_length=200)
name = models.CharField(max_length=255)
language = models.ForeignKey(Language, blank=True, null=True, on_delete=models.SET_NULL)
......@@ -189,7 +189,7 @@ class Node(CTENode):
for i, metadata_values in enumerate(metadata_list):
if verbose:
print(i, end='\r', flush=True)
name = metadata_values.get('title', '')[:200]
name = metadata_values.get('title', '')[:255]
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
if isinstance(language, tuple):
language = language[0]
......
......@@ -48,11 +48,9 @@ class EuropressFileParser(FileParser):
print(error)
except:
return []
except Exception as error:
print(error)
# initialize the list of metadata
metadata_list = []
# parse all the articles, one by one
try:
for html_article in html_articles:
......@@ -201,16 +199,9 @@ class EuropressFileParser(FileParser):
#metadata_str = {}
for key, value in metadata.items():
metadata[key] = value.decode() if isinstance(value, bytes) else value
metadata_list.append(metadata)
yield metadata
count += 1
except Exception as error:
print(error)
pass
# from pprint import pprint
# pprint(metadata_list)
# return []
return metadata_list
......@@ -103,15 +103,21 @@ class FileParser:
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
try:
metadata_list += self.parse(zipArchive.open(filename, "r"))
f = zipArchive.open(filename, 'r')
metadata_list += self.parse(f)
f.close()
except Exception as error:
print(error)
# ...otherwise, let's parse it directly!
else:
try:
metadata_list += self._parse(file)
for metadata in self._parse(file):
metadata_list.append(self.format_metadata(metadata))
if hasattr(file, 'close'):
file.close()
except Exception as error:
print(error)
# return the list of formatted metadata
return map(self.format_metadata, metadata_list)
return metadata_list
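With this hunk, `_parse` implementations are treated as generators: parse() walks zip archives, closes file handles, and runs format_metadata() on every yielded dict before returning the list. A hedged sketch of the contract a new subclass would follow (hypothetical parser, not in the repository):

class TxtFileParser(FileParser):
    # hypothetical example: one document per line of the input file
    def _parse(self, file):
        for line in file:
            yield {'title': line.decode('utf8', 'replace').strip()}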
......@@ -2,22 +2,14 @@ from django.db import transaction
from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
from datetime import datetime
from io import BytesIO
class PubmedFileParser(FileParser):
def _parse(self, file):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml = ""
if type(file)==bytes: xml = etree.parse( BytesIO(file) , parser=xml_parser)
else: xml = etree.parse(file, parser=xml_parser)
xml = etree.parse(file, parser=xml_parser)
xml_articles = xml.findall('PubmedArticle')
# initialize the list of metadata
metadata_list = []
# parse all the articles, one by one
for xml_article in xml_articles:
# extract data from the document
......@@ -25,14 +17,9 @@ class PubmedFileParser(FileParser):
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
# "abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
"realdate_year_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year',
"realdate_month_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Month',
"realdate_day_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Day',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
......@@ -41,7 +28,6 @@ class PubmedFileParser(FileParser):
for key, path in metadata_path.items():
try:
xml_node = xml_article.find(path)
# Authors tag
if key == 'authors':
metadata[key] = ', '.join([
xml_author.find('ForeName').text + ' ' + xml_author.find('LastName').text
......@@ -49,61 +35,6 @@ class PubmedFileParser(FileParser):
])
else:
metadata[key] = xml_node.text
except:
pass
#Title-Decision
Title=""
if not metadata["title"] or metadata["title"]=="":
if "title2" in metadata:
metadata["title"] = metadata["title2"]
else: metadata["title"] = ""
# Date-Decision
# forge.iscpif.fr/issues/1418
RealDate = ""
if "realdate_full_" in metadata:
RealDate = metadata["realdate_full_"]
else:
if "realdate_year_" in metadata: RealDate+=metadata["realdate_year_"]
if "realdate_month_" in metadata: RealDate+=" "+metadata["realdate_month_"]
if "realdate_day_" in metadata: RealDate+=" "+metadata["realdate_day_"]
metadata["realdate_full_"] = RealDate
RealDate = RealDate.split("-")[0]
PubmedDate = ""
if "publication_year" in metadata: PubmedDate+=metadata["publication_year"]
if "publication_month" in metadata: PubmedDate+=" "+metadata["publication_month"]
if "publication_day" in metadata: PubmedDate+=" "+metadata["publication_day"]
Decision=""
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
except:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(RealDate, '%Y %b').date()
except:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
if Decision!=False:
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
if "realdate_year_" in metadata: metadata.pop("realdate_year_")
if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "realdate_day_" in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
# print("* * * * ** * * * * ")
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
yield metadata
\ No newline at end of file
......@@ -17,42 +17,34 @@ class RisFileParser(FileParser):
}
def _parse(self, file):
metadata_list = []
metadata = {}
last_key = None
last_values = []
# browse every line of the file
for line in file:
if len(line) > 2:
# extract the parameter key
parameter_key = line[:2]
# print(parameter_key)
if parameter_key != b' ' and parameter_key != last_key:
if last_key in self._parameters:
# translate the parameter key
parameter = self._parameters[last_key]
if parameter["type"] == "metadata":
separator = parameter["separator"] if "separator" in parameter else ""
metadata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter":
#language = self._languages_fullname[metadata["language"].lower()]
#print(metadata)
try:
#print("append")
if 'language_fullname' not in metadata.keys():
if 'language_iso3' not in metadata.keys():
if 'language_iso2' not in metadata.keys():
metadata['language_iso2'] = 'en'
metadata_list.append(metadata)
yield metadata
metadata = {}
#print("append succeeded")
except:
pass
last_key = parameter_key
last_values = []
try:
last_values.append(line[3:-1].decode())
except Exception as error:
print(error)
pass
#print(len(metadata_list))
#print(metadata_list)
return metadata_list
# if a metadata object is left in memory, yield it as well
if metadata:
yield metadata
......@@ -13,6 +13,7 @@ class NgramsExtractor:
self.start()
self._label = "NP"
self._rule = self._label + ": " + rule
self._grammar = nltk.RegexpParser(self._rule)
def __del__(self):
self.stop()
......@@ -29,19 +30,8 @@ class NgramsExtractor:
"""
def extract_ngrams(self, contents):
tagged_ngrams = self.tagger.tag_text(contents)
if len(tagged_ngrams)==0: return []
grammar = nltk.RegexpParser(self._rule)
result = []
# try:
grammar_parsed = grammar.parse(tagged_ngrams)
if len(tagged_ngrams):
grammar_parsed = self._grammar.parse(tagged_ngrams)
for subtree in grammar_parsed.subtrees():
if subtree.label() == self._label:
result.append(subtree.leaves())
# except Exception as e:
# print("Problem while parsing rule '%s'" % (self._rule, ))
# print(e)
return result
yield subtree.leaves()
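Each item yielded here is the list of (token, tag) leaves of one matched noun phrase; the `terms = ' '.join(...)` loop in corpustools below consumes it in exactly that shape. An illustrative sketch, assuming `extractor` is an instance obtained from the ngramsextractors cache defined further down:

for leaves in extractor.extract_ngrams('Honey bees pollinate flowering plants.'):
    terms = ' '.join(token for token, tag in leaves).lower()
    # e.g. 'honey bees', 'flowering plants' with the default NP rule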
from .FrenchNgramsExtractor import FrenchNgramsExtractor
from .TurboNgramsExtractor import TurboNgramsExtractor as EnglishNgramsExtractor
# from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
# from .EnglishNgramsExtractor import EnglishNgramsExtractor
from .NgramsExtractor import NgramsExtractor
......@@ -71,4 +71,3 @@ class Tagger:
tokens_tags += self.tag_tokens(tokens, False)
self.tagging_end()
return tokens_tags
......@@ -9,15 +9,24 @@ from .settings import implemented_methods
class NLPClient:
def __init__(self):
self._socket = socket.socket(*server_type_client)
self._socket.connect((server_host, server_port))
self._socket = None
for method_name in dir(self):
if method_name[0] != '_':
if method_name.upper() not in implemented_methods:
setattr(self, method_name, self._notimplemented)
def __del__(self):
self._disconnect()
def _connect(self):
self._disconnect()
self._socket = socket.socket(*server_type_client)
self._socket.connect((server_host, server_port))
def _disconnect(self):
if self._socket is not None:
self._socket.close()
self._socket = None
def _notimplemented(self, *args, **kwargs):
raise NotImplementedError(
......@@ -51,7 +60,7 @@ class NLPClient:
data += language + '\n'
data += re.sub(r'\n+', '\n', text)
data += '\n\n'
self.__init__()
self._connect()
self._socket.sendall(data.encode())
sentence = []
if keys is None:
......@@ -73,7 +82,6 @@ class NLPClient:
continue
values = line.split('\t')
sentence.append(dict(zip(keys, line.split('\t'))))
self.__del__()
def tokenize(self, text, language='english', asdict=False):
keys = ('token', ) if asdict else None
......
......@@ -4,7 +4,7 @@ import socketserver
# Server parameters
server_host = 'localhost'
server_port = 1234
server_port = 7777
server_type_server = socketserver.TCPServer
server_type_client = socket.AF_INET, socket.SOCK_STREAM
server_timeout = 2.0
......
from collections import defaultdict
from datetime import datetime
from random import random
from hashlib import md5
from time import time
from math import log
from gargantext_web.db import *
from .FileParsers import *
class DebugTime:
def __init__(self, prefix):
self.prefix = prefix
self.message = None
self.time = None
def __del__(self):
if self.message is not None and self.time is not None:
print('%s - %s: %.4f' % (self.prefix, self.message, time() - self.time))
def show(self, message):
self.__del__()
self.message = message
self.time = time()
# keep all the parsers in a cache
class Parsers(defaultdict):
_parsers = {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
}
def __missing__(self, key):
if key not in self._parsers:
raise NotImplementedError('No such parser: "%s"' % (key))
parser = self._parsers[key]()
self[key] = parser
return parser
parsers = Parsers()
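A short usage sketch of the parser cache (illustrative): instances are created lazily on first lookup, reused afterwards, and unknown resource types raise NotImplementedError.

parser = parsers['pubmed']     # instantiated on first access, cached after that
for metadata in parser.parse('./data_samples/pubmed_result.xml'):
    print(metadata.get('title'), metadata.get('publication_year'))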
# resources management
def add_resource(corpus, **kwargs):
# only for tests
session = Session()
resource = Resource(guid=str(random()), **kwargs )
# User
if 'user_id' not in kwargs:
resource.user_id = corpus.user_id
# Compute the digest
h = md5()
f = open(str(resource.file), 'rb')
h.update(f.read())
f.close()
resource.digest = h.hexdigest()
# check if a resource on this node already has this hash
tmp_resource = (session
.query(Resource)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.filter(Resource.digest == resource.digest)
.filter(Node_Resource.node_id == corpus.id)
).first()
if tmp_resource is not None:
return tmp_resource
else:
session.add(resource)
session.commit()
# link with the resource
node_resource = Node_Resource(
node_id = corpus.id,
resource_id = resource.id,
parsed = False,
)
session.add(node_resource)
session.commit()
# return result
return resource
def parse_resources(corpus, user=None, user_id=None):
dbg = DebugTime('Corpus #%d - parsing' % corpus.id)
session = Session()
corpus_id = corpus.id
type_id = cache.NodeType['Document'].id
if user_id is None and user is not None:
user_id = user.id
else:
user_id = corpus.user_id
# find resource of the corpus
resources_query = (session
.query(Resource, ResourceType)
.join(ResourceType, ResourceType.id == Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.filter(Node_Resource.node_id == corpus.id)
.filter(Node_Resource.parsed == False)
)
# make a new node for every parsed document of the corpus
dbg.show('analyze documents')
nodes = list()
for resource, resourcetype in resources_query:
parser = parsers[resourcetype.name]
for metadata_dict in parser.parse(resource.file):
# retrieve language ID from metadata
if 'language_iso2' in metadata_dict:
try:
language_id = cache.Language[metadata_dict['language_iso2']].id
except KeyError:
language_id = None
else:
language_id = None
# create new node
node = Node(
name = metadata_dict.get('title', '')[:255],
parent_id = corpus_id,
user_id = user_id,
type_id = type_id,
language_id = language_id,
metadata = metadata_dict,
date = datetime.utcnow(),
)
nodes.append(node)
#
# TODO: mark node-resources associations as parsed
#
dbg.show('insert %d documents' % len(nodes))
session.add_all(nodes)
session.commit()
# now, index the metadata
dbg.show('insert metadata')
node_metadata_lists = defaultdict(list)
metadata_types = {
metadata.name: metadata
for metadata in session.query(Metadata)
}
for node in nodes:
node_id = node.id
for metadata_key, metadata_value in node.metadata.items():
try:
metadata = metadata_types[metadata_key]
except KeyError:
continue
if metadata.type == 'string':
metadata_value = metadata_value[:255]
node_metadata_lists[metadata.type].append((
node_id,
metadata.id,
metadata_value,
))
for key, values in node_metadata_lists.items():
bulk_insert(Node_Metadata, ['node_id', 'metadata_id', 'value_'+key], values)
# mark the corpus as parsed
corpus.parsed = True
# ngrams extraction
from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
class NgramsExtractors(defaultdict):
def __init__(self):
# English
self['en'] = EnglishNgramsExtractor()
for key in ('eng', 'english'):
self[key] = self['en']
# French
self['fr'] = FrenchNgramsExtractor()
for key in ('fre', 'french'):
self[key] = self['fr']
# default
self['default'] = NgramsExtractor()
def __missing__(self, key):
formatted_key = key.strip().lower()
if formatted_key in self:
self[key] = self[formatted_key]
else:
self[key] = self['default']
# raise NotImplementedError
return self[key]
ngramsextractors = NgramsExtractors()
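A note on lookups in this cache (illustrative): keys are stripped and lowercased before matching, unknown languages silently fall back to the default extractor, and the resolved instance is memoized under the original key.

extractor = ngramsextractors['FR ']      # resolves to the French extractor
extractor = ngramsextractors['klingon']  # unknown language -> self['default']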
def extract_ngrams(corpus, keys):
dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
# query the metadata associated with the given keys
columns = [Node.id, Node.language_id] + [Node.metadata[key] for key in keys]
metadata_query = (session
.query(*columns)
.filter(Node.parent_id == corpus.id)
.filter(Node.type_id == cache.NodeType['Document'].id)
)
# prepare data to be inserted
dbg.show('find ngrams')
languages_by_id = {
language.id: language.iso2
for language in session.query(Language)
}
ngrams_data = set()
node_ngram_list = defaultdict(lambda: defaultdict(int))
for nodeinfo in metadata_query:
node_id = nodeinfo[0]
language_id = nodeinfo[1]
if language_id is None:
language_iso2 = default_language_iso2
else:
language_iso2 = languages_by_id.get(language_id, None)
if language_iso2 is None:
continue
ngramsextractor = ngramsextractors[language_iso2]
for text in nodeinfo[2:]:
if text is not None and len(text):
ngrams = ngramsextractor.extract_ngrams(text)
for ngram in ngrams:
terms = ' '.join([token for token, tag in ngram]).lower()
n = len(ngram)
node_ngram_list[node_id][terms] += 1
ngrams_data.add(
(n, terms)
)
# insert ngrams to temporary table
dbg.show('find ids for the %d ngrams' % len(ngrams_data))
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__ngrams (
id INT,
n INT NOT NULL,
terms VARCHAR(255) NOT NULL
)
''')
bulk_insert('tmp__ngrams', ['n', 'terms'], ngrams_data, cursor=cursor)
# retrieve ngram ids from already inserted stuff
cursor.execute('''
UPDATE
tmp__ngrams
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngrams.terms
''' % (Ngram.__table__.name, ))
# insert, then get the ids back
cursor.execute('''
INSERT INTO
%s (n, terms)
SELECT
n, terms
FROM
tmp__ngrams
WHERE
id IS NULL
''' % (Ngram.__table__.name, ))
cursor.execute('''
UPDATE
tmp__ngrams
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngrams.terms
AND
tmp__ngrams.id IS NULL
''' % (Ngram.__table__.name, ))
# get all ids
ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngrams')
for row in cursor.fetchall():
ngram_ids[row[1]] = row[0]
#
dbg.show('insert associations')
node_ngram_data = list()
for node_id, ngrams in node_ngram_list.items():
for terms, weight in ngrams.items():
ngram_id = ngram_ids[terms]
node_ngram_data.append((node_id, ngram_id, weight, ))
bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'], node_ngram_data, cursor=cursor)
dbg.message = 'insert %d associations' % len(node_ngram_data)
# commit to database
db.commit()
# tfidf calculation
def compute_tfidf(corpus):
dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
# compute terms frequency sum
dbg.show('calculate terms frequencies sums')
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__st (
node_id INT NOT NULL,
frequency DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__st (node_id, frequency)
SELECT
node_ngram.node_id,
SUM(node_ngram.weight) AS frequency
FROM
%s AS node
INNER JOIN
%s AS node_ngram ON node_ngram.node_id = node.id
WHERE
node.parent_id = %d
GROUP BY
node_ngram.node_id
''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
# compute normalized terms frequencies
dbg.show('normalize terms frequencies')
cursor.execute('''
CREATE TEMPORARY TABLE tmp__tf (
node_id INT NOT NULL,
ngram_id INT NOT NULL,
frequency DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__tf (node_id, ngram_id, frequency)
SELECT
node_ngram.node_id,
node_ngram.ngram_id,
(node_ngram.weight / node.frequency) AS frequency
FROM
%s AS node_ngram
INNER JOIN
tmp__st AS node ON node.node_id = node_ngram.node_id
''' % (Node_Ngram.__table__.name, ))
# show off
dbg.show('compute idf')
cursor.execute('''
CREATE TEMPORARY TABLE tmp__idf (
ngram_id INT NOT NULL,
idf DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__idf(ngram_id, idf)
SELECT
node_ngram.ngram_id,
-ln(COUNT(*))
FROM
%s AS node
INNER JOIN
%s AS node_ngram ON node_ngram.node_id = node.id
WHERE
node.parent_id = %d
GROUP BY
node_ngram.ngram_id
''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
cursor.execute('SELECT COUNT(*) FROM tmp__st')
D = cursor.fetchone()[0]
lnD = log(D)
cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
# show off
dbg.show('insert tfidf for %d documents' % D)
cursor.execute('''
INSERT INTO
%s (nodex_id, nodey_id, ngram_id, score)
SELECT
%d AS nodex_id,
tf.node_id AS nodey_id,
tf.ngram_id AS ngram_id,
(tf.frequency * idf.idf) AS score
FROM
tmp__idf AS idf
INNER JOIN
tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
''' % (NodeNodeNgram.__table__.name, corpus.id, ))
# # show off
# cursor.execute('''
# SELECT
# node.name,
# ngram.terms,
# node_node_ngram.score AS tfidf
# FROM
# %s AS node_node_ngram
# INNER JOIN
# %s AS node ON node.id = node_node_ngram.nodey_id
# INNER JOIN
# %s AS ngram ON ngram.id = node_node_ngram.ngram_id
# WHERE
# node_node_ngram.nodex_id = %d
# ORDER BY
# score DESC
# ''' % (NodeNodeNgram.__table__.name, Node.__table__.name, Ngram.__table__.name, corpus.id, ))
# for row in cursor.fetchall():
# print(row)
# the end!
db.commit()
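As a restatement of what the temporary tables compute (for clarity, not part of the commit): for a corpus of D documents, each (document, ngram) pair gets score tf * idf, where tf is the ngram's weight divided by the document's total weight and idf = ln(D) - ln(number of documents containing the ngram). In plain Python:

from math import log

def tfidf(weight, doc_total_weight, corpus_size, docs_with_term):
    tf = weight / doc_total_weight
    idf = log(corpus_size) - log(docs_with_term)
    return tf * idf

# e.g. a weight of 2 out of a document total of 10, for an ngram present
# in 5 of 100 documents: tfidf(2, 10, 100, 5) == 0.2 * log(20), about 0.599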
# Without this, we couldn't use the Django environment
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")
# database tools
from gargantext_web.db import *
from parsing.corpustools import *
user = session.query(User).first()
project = session.query(Node).filter(Node.name == 'A').first()
corpus = Node(
parent_id = project.id,
name = 'Test 456',
type_id = cache.NodeType['Corpus'].id,
user_id = user.id,
)
session.add(corpus)
session.commit()
add_resource(corpus,
# file = './data_samples/pubmed_result.xml',
file = './data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
type_id = cache.ResourceType['pubmed'].id,
)
parse_resources(corpus)
extract_ngrams(corpus, ('title', ))
# print(corpus)
# corpus = session.query(Node).filter(Node.id == 72771).first()
# corpus = session.query(Node).filter(Node.id == 73017).first()
compute_tfidf(corpus)