Commit 49286b9a authored by Mathieu Rodic's avatar Mathieu Rodic

[OPTI] Started optimizing parsing

https://forge.iscpif.fr/issues/1438
parent 76c1a3dd
from node import models
from gargantext_web import settings
from node import models
__all__ = ['literalquery', 'session', 'cache', 'Session']
......
......@@ -12,6 +12,7 @@ from node.admin import CustomForm
from gargantext_web.db import *
from gargantext_web.settings import DEBUG
from parsing.corpus import parse_resources
def project(request, project_id):
......@@ -49,6 +50,7 @@ def project(request, project_id):
.join(Resource, Resource.id == Node_Resource.resource_id)
.filter(Node.parent_id == project.id)
.group_by(Node, Resource)
.order_by(Node.name)
)
corpora_by_resourcetype = defaultdict(list)
documents_count_by_resourcetype = defaultdict(int)
......@@ -69,7 +71,7 @@ def project(request, project_id):
donut = [
{ 'source': key,
'count': value,
'part' : round(value * 100 / total_documents_count),
'part' : round(value * 100 / total_documents_count) if total_documents_count else 0,
}
for key, value in documents_count_by_resourcetype.items()
]
......@@ -108,10 +110,11 @@ def project(request, project_id):
)
# let's start the workflow
try:
if DEBUG is True:
dj_corpus.workflow()
else:
dj_corpus.workflow.apply_async((), countdown=3)
parse_resources(dj_corpus, user_id=request.user.id)
# if DEBUG is True:
# dj_corpus.workflow()
# else:
# dj_corpus.workflow.apply_async((), countdown=3)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
......
from collections import defaultdict
from datetime import datetime
from gargantext_web.db import *
from .FileParsers import *
_parsers = {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
}
# keep all the parsers in a cache
class Parsers(defaultdict):
def parse_corpus_resources(corpus, user=None, user_id=None):
_parsers = {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
}
def __missing__(self, key):
if key not in self._parsers:
raise NotImplementedError('No such parser: "%s"' % (key))
parser = self._parsers[key]()
self[key] = parser
return parser
parsers = Parsers()
def parse_resources(corpus, user=None, user_id=None):
session = Session()
type_id = cache.NodeType['Document']
corpus_id = corpus.id
type_id = cache.NodeType['Document'].id
if user_id is None and user is not None:
user_id = user.id
# keep all the parsers in a cache
parsers = defaultdict(lambda key: _parsers[key]())
# find resource of the corpus
resources_query = (session
.query(Resource, ResourceType)
.join(ResourceType, ResourceType.id == Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource)
.join(Node, Node.id == Node_Resource.node_id)
.filter(Node.parent_id == corpus.id)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.filter(Node_Resource.node_id == corpus.id)
.filter(Node_Resource.parsed == False)
)
# make a new node for every parsed document of the corpus
nodes = list()
for resource, resourcetype in resources_query:
parser = parsers[resourcetype.name]
for metadata_dict in resource:
for metadata_dict in parser.parse(resource.file):
# retrieve language ID from metadata
if 'language_iso2' in metadata_dict:
try:
language_id = cache.Langage[metadata_dict['language_iso2']]
language_id = cache.Language[metadata_dict['language_iso2']].id
except KeyError:
language_id = None
else:
language_id = None
# create new node
node = Node(
name = metadata.get('title', ''),
parent_id = corpus.id,
user_id = user_id,
type_id = type_id,
language_id = language_id,
metadata = metadata_dict,
)
node = Node()
node.name = metadata_dict.get('title', '')
node.parent_id = corpus_id
node.user_id = user_id
node.type_id = type_id
node.language_id = language_id
node.metadata = metadata_dict
node.date = datetime.utcnow()
nodes.append(node)
session.add_bulk(nodes)
#
# TODO: mark node-resources associations as parsed
#
session.add_all(nodes)
session.commit()
# now, index the metadata
for node in nodes:
......@@ -62,18 +79,17 @@ def parse_corpus_resources(corpus, user=None, user_id=None):
metadata = cache.Metadata[key]
if metadata.type == 'string':
metadata_value = metadata_value[:255]
node_metadata = Node_Metadata(**{
'node_id': node_id,
'metadata_id': metadata.id,
'value_'+metadata.type: value,
})
node_metadata = Node_Metadata()
node_metadata.node_id = node_id
node_metadata.metadata_id = metadata.id
setattr(node_metadata, 'value_'+metadata.type, metadata_value)
session.add(node_metadata)
session.commit()
# mark the corpus as parsed
corpus.parsed = True
def parse_corpus(corpus):
def extract_ngrams(corpus):
# prepare the cache for ngrams
from nodes import models
ngrams = ModelCache(models.Node)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment