Commit f6122f1c authored by Mathieu Rodic

[FEATURE] file parsers - separate parsing from extraction

Work in progress...
parent 86bbf12a
{
"metadata": {
"name": "",
"signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
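The last cell above only prints each ngram's terms. Since the Node_Ngram rows written by the parser also carry an occurrence count (the occurences field used in FileParser below), a variant of that cell could print both. This is a hypothetical follow-up cell, not part of the commit:

    document = corpus.children.first()
    for node_ngram in document.node_ngram_set.all():
        print(node_ngram.ngram.terms, node_ngram.occurences)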
{
 "metadata": {
  "name": "",
- "signature": "sha256:eac7c9b22e240bb0ef6d0aeec21261194d84a3f0ba53cd02af69f80d30ec5a17"
+ "signature": "sha256:70c2c8a4c8089e61195ee9da9232043152cf5e6c658a32115c0dcf990c2e98af"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
@@ -122,17 +122,34 @@
 ],
 "language": "python",
 "metadata": {},
-"outputs": []
+"outputs": [],
+"prompt_number": 1
 },
 {
 "cell_type": "code",
 "collapsed": false,
 "input": [
-"d = dateutil.parser.parse(\"2014 OCT 11 1:2:3\")"
+"import locale\n",
+"locale.setlocale(locale.LC_ALL, \"fr_FR\")\n",
+"d = dateutil.parser.parse(\"20 janvier 2004\")"
 ],
 "language": "python",
 "metadata": {},
-"outputs": []
+"outputs": [
+{
+"ename": "TypeError",
+"evalue": "'NoneType' object is not iterable",
+"output_type": "pyerr",
+"traceback": [
+"TypeError                                 Traceback (most recent call last)",
+"<ipython-input-2-0756678732db> in <module>()",
+"      1 import locale",
+"      2 locale.setlocale(locale.LC_ALL, \"fr_FR\")",
+"----> 3 d = dateutil.parser.parse(\"20 janvier 2004\")",
+"/home/mat/projects/gargantext/myvenv/lib/python3.4/site-packages/dateutil/parser.py in parse(timestr, parserinfo, **kwargs)",
+"--> 748         return DEFAULTPARSER.parse(timestr, **kwargs)",
+"/home/mat/projects/gargantext/myvenv/lib/python3.4/site-packages/dateutil/parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)",
+"--> 310         res, skipped_tokens = self._parse(timestr, **kwargs)",
+"TypeError: 'NoneType' object is not iterable"
+]
+}
+],
+"prompt_number": 2
 },
 {
 "cell_type": "code",
@@ -142,7 +159,17 @@
 ],
 "language": "python",
 "metadata": {},
-"outputs": []
+"outputs": [
+{
+"metadata": {},
+"output_type": "pyout",
+"prompt_number": 7,
+"text": [
+"'2014-02-02 00:00:00'"
+]
+}
+],
+"prompt_number": 7
 },
 {
 "cell_type": "code",
@@ -152,7 +179,8 @@
 ],
 "language": "python",
 "metadata": {},
-"outputs": []
+"outputs": [],
+"prompt_number": 8
 },
 {
 "cell_type": "code",
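The TypeError captured in the notebook diff above comes from dateutil not knowing French month names: locale.setlocale() has no effect on dateutil, which relies on its own parserinfo tables. A minimal workaround sketch, assuming the dateutil version bundled with the project (the FrenchParserInfo class name is illustrative):

    import dateutil.parser

    class FrenchParserInfo(dateutil.parser.parserinfo):
        # French month names; dateutil matches tokens against these case-insensitively
        MONTHS = [('janv', 'janvier'), ('fevr', 'février'), ('mars', 'mars'),
                  ('avr', 'avril'), ('mai', 'mai'), ('juin', 'juin'),
                  ('juil', 'juillet'), ('aout', 'août'), ('sept', 'septembre'),
                  ('oct', 'octobre'), ('nov', 'novembre'), ('dec', 'décembre')]

    d = dateutil.parser.parse("20 janvier 2004", parserinfo=FrenchParserInfo())
    # expected result: datetime.datetime(2004, 1, 20, 0, 0)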
This diff is collapsed.
@@ -3,12 +3,12 @@ from parsing.NgramsExtractors import *
 
 import collections
 import dateutil.parser
+import zipfile
 
 
 class NgramCache:
-    """
-    This allows the fast retrieval of ngram ids
-    from the cache instead of using the database for every call
+    """This allows the fast retrieval of ngram ids
+    from a cache instead of calling the database every time
     """
 
     def __init__(self, language):
@@ -35,9 +35,9 @@ class NgramCaches(collections.defaultdict):
 
 
-"""Base class for performing files parsing depending on their type.
-"""
 class FileParser:
+    """Base class for performing files parsing depending on their type.
+    """
 
     def __init__(self, file=None, filepath="", encoding="utf8"):
         # ...get the file item...
@@ -54,11 +54,10 @@ class FileParser:
         self._languages_fullname = {language.fullname.lower(): language for language in languages}
         self._languages_iso2 = {language.iso2.lower(): language for language in languages}
        self._languages_iso3 = {language.iso3.lower(): language for language in languages}
-        #self.parse()
 
-    """Extract the ngrams from a given text.
-    """
     def extract_ngrams(self, text, language):
+        """Extract the ngrams from a given text.
+        """
         # Get the appropriate ngrams extractor, if it exists
         if language not in self._extractors:
             extractor = None
@@ -75,20 +74,13 @@ class FileParser:
             for ngram in extractor.extract_ngrams(text):
                 ngram_text = ' '.join([token for token, tag in ngram])
                 tokens.append(ngram_text)
-            return collections.Counter(
-                # [token for token, tag in extractor.extract_ngrams(text)]
-                tokens
-            )
+            return collections.Counter(tokens)
         else:
             return dict()
 
-    #TODO
-    # * make it possible to tag and parse separately
-    # * only tags some data (only titles, titles & abstracts, some chapters...)
-    """Add a document to the database.
-    """
-    def create_document(self, parentNode, title, contents, language, metadata, guid=None):
+    def create_document(self, parentNode, title, metadata, guid=None):
+        """Add a document to the database.
+        """
         metadata = self.format_metadata(metadata)
         # create or retrieve a resource for that document, based on its user id
         # if guid is None:
@@ -103,6 +95,10 @@ class FileParser:
         # if parentNode.descendants().filter(resource=resource).exists():
         #     return None
         # create the document itself
+        try:
+            language = self._languages_iso3[metadata["language_iso3"]]
+        except:
+            language = None
         childNode = Node(
             user = parentNode.user,
             type = self._document_nodetype,
@@ -113,39 +109,74 @@ class FileParser:
             parent = parentNode
         )
         childNode.save()
-        # parse it!
-        ngrams = self.extract_ngrams(contents, language)
-        # we are already in a transaction, so no use doing another one (or is there?)
-        ngramcache = self._ngramcaches[language]
-        for terms, occurences in ngrams.items():
-            ngram = ngramcache[terms]
-            Node_Ngram(
-                node = childNode,
-                ngram = ngram,
-                occurences = occurences
-            ).save()
-        # return the created document
         return childNode
 
-    """Useful method to detect the document encoding.
-    Not sure it should be here actually.
-    """
     def detect_encoding(self, string):
-        # see the chardet library
+        """Useful method to detect the document encoding.
+        """
         pass
 
-    """Parse the data.
-    This method shall be overriden by inherited classes.
-    """
-    def parse(self):
+    def _parse(self, parentNode, file):
+        """This method shall be overriden by inherited classes."""
         return list()
 
+    def parse(self, parentNode, file=None):
+        """Parse the files found in the file.
+        This method shall be overriden by inherited classes.
+        """
+        if file is None:
+            with transaction.atomic():
+                self.parse(parentNode, self._file)
+        if zipfile.is_zipfile(file):
+            with zipfile.ZipFile(file) as zipArchive:
+                for filename in zipArchive.namelist():
+                    self.parse(parentNode, zipArchive.open(filename, "r"))
+        else:
+            self._parse(parentNode, file)
+
+    def extract(self, parentNode, keys):
+        """Extract ngrams from the child nodes, given a list of field names."""
+        # get all the descendants of type "document"
+        childNodes = parentNode.descendants().filter(type=self._document_nodetype)
+        with transaction.atomic():
+            for childNode in childNodes:
+                # most importantly...
+                metadata = childNode.metadata
+                # which extractor shall we use?
+                if language not in self._extractors:
+                    extractor = None
+                    if language.iso2 == 'en':
+                        # use English
+                        extractor = EnglishNgramsExtractor()
+                    elif language.iso2 == 'fr':
+                        # use French
+                        extractor = FrenchNgramsExtractor()
+                    else:
+                        # no recognized language has been specified...
+                        continue
+                    self._extractors[language] = extractor
+                # extract ngrams from every field, find the id, count them
+                ngrams = collections.defaultdict(int)
+                ngramscache = self._ngramcaches[language]
+                for key in keys:
+                    for ngram in extractor.extract_ngrams(text):
+                        ngram_text = ' '.join([token for token, tag in ngram])
+                        ngram_id = ngramscache[ngramtext].id
+                        ngrams[ngram_id] += 1
+                # insert node/ngram associations in the database
+                for ngram_id, occurences in ngrams.items():
+                    Node_Ngram(
+                        node_id = childNode.id,
+                        ngram_id = ngram_id,
+                        occurences = occurences
+                    ).save()
+
     def format_metadata_dates(self, metadata):
         """Format the dates found in the metadata.
-        Example: {"publication_date": "2014-10-23 09:57:42"} -> {...}
+        Examples:
+            {"publication_date": "2014-10-23 09:57:42"}
+                -> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014"}
         """
 
         # First, check the split dates...
@@ -185,8 +216,33 @@ class FileParser:
         # finally, return the result!
         return metadata
 
+    def format_metadata_languages(self, metadata):
+        """format the languages found in the metadata."""
+        try:
+            if "language_fullname" in metadata:
+                language = self._languages_fullname[metadata["language_fullname"].lower()]
+            elif "language_iso3" in metadata:
+                language = self._languages_iso3[metadata["language_iso3"].lower()]
+            elif "language_iso2" in metadata:
+                language = self._languages_iso2[metadata["language_iso2"].lower()]
+            else:
+                return metadata
+        except KeyError:
+            # the language has not been found
+            for key in ["language_fullname", "language_iso3", "language_iso2"]:
+                try:
+                    metadata.pop(key)
+                except:
+                    continue
+            return metadata
+        metadata["language_iso2"] = language.iso2
+        metadata["language_iso3"] = language.iso3
+        metadata["language_fullname"] = language.fullname
+        return metadata
+
     def format_metadata(self, metadata):
         """Format the metadata."""
         metadata = self.format_metadata_dates(metadata)
-        return metadata
\ No newline at end of file
+        metadata = self.format_metadata_languages(metadata)
+        return metadata
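Taken together, the FileParser changes above are the heart of this commit: create_document() no longer extracts ngrams itself, so building Document nodes (parse) and counting ngrams (extract) become two separate passes. A usage sketch of the intended workflow, assuming a corpus Node as in the notebook at the top of this commit (the field names passed to extract() are illustrative):

    fileparser = PubmedFileParser.PubmedFileParser(file='/path/to/pubmed.zip')
    # first pass: create one Document node per article, with its metadata
    fileparser.parse(corpus)
    # second pass: extract and count ngrams from the selected metadata fields
    fileparser.extract(corpus, ['title', 'abstract'])

Note that extract() as committed still references an undefined language and text, so this shows the intended interface rather than a working pipeline, consistent with the "Work in progress" note in the commit message.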
@@ -2,54 +2,47 @@ from django.db import transaction
 from lxml import etree
 from parsing.FileParsers.FileParser import FileParser
 from parsing.NgramsExtractors import *
-import zipfile
-import datetime
 
 class PubmedFileParser(FileParser):
-    def parse(self, parentNode=None, tag=True):
+    def _parse(self, parentNode, file):
         # open the file as XML
         xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
-        documents = []
+        xml = etree.parse(file, parser=xml_parser)
+        xml_articles = xml.findall('PubmedArticle')
         with transaction.atomic():
-            with zipfile.ZipFile(self._file) as zipFile:
-                for filename in zipFile.namelist():
-                    file = zipFile.open(filename, "r")
-                    xml = etree.parse(file, parser=xml_parser)
-                    # parse all the articles, one by one
-                    # all database operations should be performed within one transaction
-                    xml_articles = xml.findall('PubmedArticle')
-                    for xml_article in xml_articles:
-                        # extract data from the document
-                        metadata = {}
-                        metadata_path = {
-                            "journal"           : 'MedlineCitation/Article/Journal/Title',
-                            "title"             : 'MedlineCitation/Article/ArticleTitle',
-                            "language_iso3"     : 'MedlineCitation/Article/Language',
-                            "doi"               : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
-                            "abstract"          : 'MedlineCitation/Article/Abstract/AbstractText',
-                            "publication_year"  : 'MedlineCitation/DateCreated/Year',
-                            "publication_month" : 'MedlineCitation/DateCreated/Month',
-                            "publication_day"   : 'MedlineCitation/DateCreated/Day',
-                        }
-                        for key, path in metadata_path.items():
-                            try:
-                                node = xml_article.find(path)
-                                metadata[key] = node.text
-                            except:
-                                metadata[key] = ""
-                        contents = metadata["abstract"]
-                        # create the document in the database
-                        document = self.create_document(
-                            parentNode = parentNode,
-                            title      = metadata["title"],
-                            contents   = contents,
-                            language   = self._languages_iso3[metadata["language_iso3"].lower()],
-                            metadata   = metadata,
-                            #guid      = metadata["doi"],
-                        )
-                        if document:
-                            documents.append(document)
-        return documents
+            # initialize the list of documents
+            documents = []
+            # parse all the articles, one by one
+            # all database operations should be performed within one transaction
+            for xml_article in xml_articles:
+                # extract data from the document
+                metadata = {}
+                metadata_path = {
+                    "journal"           : 'MedlineCitation/Article/Journal/Title',
+                    "title"             : 'MedlineCitation/Article/ArticleTitle',
+                    "language_iso3"     : 'MedlineCitation/Article/Language',
+                    "doi"               : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
+                    "abstract"          : 'MedlineCitation/Article/Abstract/AbstractText',
+                    "publication_year"  : 'MedlineCitation/DateCreated/Year',
+                    "publication_month" : 'MedlineCitation/DateCreated/Month',
+                    "publication_day"   : 'MedlineCitation/DateCreated/Day',
+                }
+                for key, path in metadata_path.items():
+                    try:
+                        node = xml_article.find(path)
+                        metadata[key] = node.text
+                    except:
+                        metadata[key] = ""
+                contents = metadata["abstract"]
+                # create the document in the database
+                document = self.create_document(
+                    parentNode = parentNode,
+                    title      = metadata["title"],
+                    metadata   = metadata,
+                    #guid      = metadata["doi"],
+                )
+                if document:
+                    documents.append(document)
+            # return the list of documents
+            return documents
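One detail in the PubmedFileParser diff: the DOI path changes from ArticleIdList/ArticleId[type=doi] to ArticleIdList/ArticleId[@type=doi]. The limited XPath understood by ElementTree/lxml find() normally also wants the attribute value quoted, and PubMed XML names this attribute IdType, so the usual form of that lookup would be something like the sketch below (not what this commit contains; the surrounding try/except is why a failed lookup silently ends up as an empty string):

    node = xml_article.find("PubmedData/ArticleIdList/ArticleId[@IdType='doi']")
    doi = node.text if node is not None else ""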
from django.db import transaction
from parsing.FileParsers.FileParser import FileParser


class RisFileParser(FileParser):

    _parameters = {
    }

    def _parse(self, parentNode, file):
        metadata = {}
        last_key = None
        last_values = []
        with transaction.atomic():
            for line in self._file:
                if len(line) > 2:
                    parameter_key = line[:2]
                    if parameter_key != b' ' and parameter_key != last_key:
                        if last_key in self._parameters:
                            parameter = self._parameters[last_key]
                            if parameter["type"] == "metadata":
                                separator = parameter["separator"] if "separator" in parameter else ""
                                metadata[parameter["key"]] = separator.join(last_values)
                            elif parameter["type"] == "delimiter":
                                language = self._languages_fullname[metadata["language"].lower()]
                                # self.create_document(
                                #     parentNode = parentNode,
                                #     title      = metadata["title"],
                                #     metadata   = metadata,
                                #     guid       = metadata["doi"]
                                # )
                                print(self.format_metadata(metadata))
                                print()
                                metadata = {}
                        last_key = parameter_key
                        last_values = []
                    last_values.append(line[3:-1].decode())
        self._file.close()
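RisFileParser._parameters is left empty in this commit. Given how _parse() consumes it (a per-tag dict carrying "type", "key" and an optional "separator", plus a "delimiter" entry that closes a record), a subclass for common RIS tags might fill it in along these lines; the tag-to-key mapping is purely illustrative, not part of the commit:

    class StandardRisFileParser(RisFileParser):
        _parameters = {
            b"TI": {"type": "metadata", "key": "title", "separator": " "},
            b"AB": {"type": "metadata", "key": "abstract", "separator": " "},
            b"AU": {"type": "metadata", "key": "authors", "separator": ", "},
            b"PY": {"type": "metadata", "key": "publication_year"},
            b"LA": {"type": "metadata", "key": "language"},
            b"ER": {"type": "delimiter"},
        }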