Commit b811a2bb authored by Administrator

Merge branch 'master' into alex

Integration of master
parents f76b6997 845d37bb
{
"metadata": {
"name": "",
"signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:eac7c9b22e240bb0ef6d0aeec21261194d84a3f0ba53cd02af69f80d30ec5a17"
"signature": "sha256:70c2c8a4c8089e61195ee9da9232043152cf5e6c658a32115c0dcf990c2e98af"
},
"nbformat": 3,
"nbformat_minor": 0,
......@@ -122,17 +122,34 @@
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"d = dateutil.parser.parse(\"2014 OCT 11 1:2:3\")"
"import locale\n",
"locale.setlocale(locale.LC_ALL, \"fr_FR\")\n",
"d = dateutil.parser.parse(\"20 janvier 2004\")"
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [
{
"ename": "TypeError",
"evalue": "'NoneType' object is not iterable",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-2-0756678732db>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mlocale\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mlocale\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetlocale\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlocale\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mLC_ALL\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"fr_FR\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0md\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdateutil\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"20 janvier 2004\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/home/mat/projects/gargantext/myvenv/lib/python3.4/site-packages/dateutil/parser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(timestr, parserinfo, **kwargs)\u001b[0m\n\u001b[0;32m 746\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparserinfo\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimestr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 747\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 748\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mDEFAULTPARSER\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimestr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 749\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 750\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/mat/projects/gargantext/myvenv/lib/python3.4/site-packages/dateutil/parser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, timestr, default, ignoretz, tzinfos, **kwargs)\u001b[0m\n\u001b[0;32m 308\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 309\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 310\u001b[1;33m \u001b[0mres\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mskipped_tokens\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_parse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimestr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 311\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 312\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mres\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: 'NoneType' object is not iterable"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
......@@ -142,7 +159,17 @@
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"'2014-02-02 00:00:00'"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
......@@ -152,7 +179,8 @@
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
......
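The TypeError in the notebook above is expected: dateutil.parser does not consult the C locale, so the locale.setlocale call has no effect on month-name parsing. A minimal sketch of one possible workaround, passing a custom parserinfo with French month names (the class and its month list are illustrative, not part of this commit):

import dateutil.parser

class FrenchParserInfo(dateutil.parser.parserinfo):
    # month-name variants that dateutil should recognise
    MONTHS = [('jan', 'janvier'), ('fev', 'février', 'fevrier'),
              ('mar', 'mars'), ('avr', 'avril'), ('mai',),
              ('juin',), ('juil', 'juillet'), ('aou', 'août', 'aout'),
              ('sep', 'septembre'), ('oct', 'octobre'),
              ('nov', 'novembre'), ('dec', 'décembre', 'decembre')]

d = dateutil.parser.parse("20 janvier 2004", parserinfo=FrenchParserInfo())
print(d)  # 2004-01-20 00:00:00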
from django.db import models
from django.utils import timezone
from django.contrib.auth.models import User
from django_hstore import hstore
from cte_tree.models import CTENode, Manager
#from cte_tree.fields import DepthField, PathField, OrderingField
from parsing.Caches import LanguagesCache, NgramsExtractorsCache, NgramsCaches
from parsing.FileParsers import *
from time import time
from collections import defaultdict
from django.contrib.auth.models import User
from collections import defaultdict
# Some useful functions
# TODO: start the function name with an underscore (private)
def upload_to(instance, filename):
return 'corpora/%s/%s' % (instance.user.username, filename)
#return 'corpora/%s/%f/%s' % (instance.user.username, time(), filename)
......@@ -28,35 +31,50 @@ class Language(models.Model):
def __str__(self):
return self.fullname
class DatabaseType(models.Model):
class ResourceType(models.Model):
name = models.CharField(max_length=255)
def __str__(self):
return self.name
class Ngram(models.Model):
language = models.ForeignKey(Language, blank=True, null=True, on_delete=models.SET_NULL)
n = models.IntegerField()
terms = models.CharField(max_length=255)
class Resource(models.Model):
user = models.ForeignKey(User)
guid = models.CharField(max_length=255)
bdd_type = models.ForeignKey(DatabaseType, blank=True, null=True)
type = models.ForeignKey(ResourceType, blank=True, null=True)
file = models.FileField(upload_to=upload_to, blank=True)
def __str__(self):
return "%s => %s" % (self.bdd_type, self.file)
digest = models.CharField(max_length=32) # MD5 digest
class NodeType(models.Model):
name = models.CharField(max_length=200)
def __str__(self):
return self.name
class Ngram(models.Model):
language = models.ForeignKey(Language, blank=True, null=True, on_delete=models.SET_NULL)
n = models.IntegerField()
terms = models.CharField(max_length=255)
def __str__(self):
return "[%d] %s" % (self.pk, self.terms)
class NodeQuerySet(models.query.QuerySet):
"""Methods available from Node querysets."""
def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
if ngramsextractorscache is None:
ngramsextractorscache = NgramsExtractorsCache()
if ngramscaches is None:
ngramscaches = NgramsCaches()
for node in self:
node.extract_ngrams(keys, ngramsextractorscache, ngramscaches)
class NodeManager(models.Manager):
"""Methods available from Node.object."""
def get_queryset(self):
return NodeQuerySet(self.model)
def __getattr__(self, name, *args):
if name.startswith("_"):
raise AttributeError
return getattr(self.get_queryset(), name, *args)
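# Illustration (not part of the model): because NodeManager.__getattr__ forwards
# unknown attributes to a fresh NodeQuerySet, queryset-level helpers such as
# extract_ngrams() are reachable from Node.objects as well as from any queryset,
# e.g. corpus.children.all().extract_ngrams(['title', 'abstract'])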
class Node(CTENode):
objects = Manager()
"""The node."""
objects = NodeManager()
user = models.ForeignKey(User)
type = models.ForeignKey(NodeType)
......@@ -66,30 +84,104 @@ class Node(CTENode):
date = models.DateField(default=timezone.now, blank=True)
metadata = hstore.DictionaryField(blank=True)
resource = models.ManyToManyField(Resource, blank=True)
ngrams = models.ManyToManyField(Ngram, blank=True, help_text="Hold down")
# TODO: remove the three following fields
fichier = models.FileField(upload_to=upload_to, blank=True)
#resource = models.ForeignKey(Resource, blank=True, null=True)
#ngrams = models.ManyToManyField(NGrams)
def __str__(self):
return self.name
def liste(self, user):
for noeud in Node.objects.filter(user=user):
print(noeud.depth * " " + "[%d] %s" % (noeud.pk, noeud.name))
def add_resource(self, **kwargs):
resource = Resource(**kwargs)
# TODO: check whether all of these 'save' calls are really necessary
resource.save()
node_resource = Node_Resource(
node = self,
resource = resource
)
node_resource.save()
return resource
def parse_resources(self):
# parse all resources into a list of metadata
metadata_list = []
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
})[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
# insert the new resources in the database!
type = NodeType.objects.get(name='Document')
languages_cache = LanguagesCache()
Node.objects.bulk_create([
Node(
user = self.user,
type = type,
name = metadata['title'] if 'title' in metadata else '',
parent = self,
language = languages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
metadata = metadata,
)
for metadata in metadata_list
])
# mark the resources as parsed for this node
self.node_resource.update(parsed=True)
def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
# if there is no cache...
if ngramsextractorscache is None:
ngramsextractorscache = NgramsExtractorsCache()
if ngramscaches is None:
ngramscaches = NgramsCaches()
# what do we want from the cache?
extractor = ngramsextractorscache[self.language]
ngrams = ngramscaches[self.language]
# find & count all the occurrences
associations = defaultdict(float) # float or int?
if isinstance(keys, dict):
for key, weight in keys.items():
for ngram in extractor.extract_ngrams(self.metadata[key]):
terms = ' '.join([token for token, tag in ngram])
associations[terms] += weight
else:
for key in keys:
for ngram in extractor.extract_ngrams(self.metadata[key]):
terms = ' '.join([token for token, tag in ngram])
associations[terms] += 1
# insert the occurrences in the database
Node_Ngram.objects.bulk_create([
Node_Ngram(
node = self,
ngram = ngrams[ngram_text],
weight = weight
)
for ngram_text, weight in associations.items()
])
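# Typical corpus workflow combining the methods above (sketch only; `typePubmed`
# stands for a ResourceType named 'pubmed', as in the initialisation script below):
#     corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed)
#     corpus.parse_resources()    # one Document child per parsed metadata dict
#     corpus.children.all().extract_ngrams(['title', 'abstract'])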
class Node_Resource(models.Model):
node = models.ForeignKey(Node, related_name='node_resource')
resource = models.ForeignKey(Resource)
parsed = models.BooleanField(default=False)
class Node_Ngram(models.Model):
node = models.ForeignKey(Node)
ngram = models.ForeignKey(Ngram)
weight = models.FloatField()
class Project(Node):
class Meta:
proxy=True
class CorpusManager(models.Manager):
def get_query_set(self):
corpus_type = NodeType.objects.get(name='Corpus')
return super(CorpusManager, self).get_query_set().filter(type=corpus_type)
class Corpus(Node):
objects = CorpusManager()
class Meta:
proxy=True
verbose_name_plural = 'Corpora'
......@@ -98,28 +190,4 @@ class Document(Node):
class Meta:
proxy=True
############################
# NGRAMS
############################
class Node_Ngram(models.Model):
node = models.ForeignKey(Node, on_delete=models.CASCADE)
ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
occurences = models.IntegerField()
def __str__(self):
return "%s: %s" % (self.node.name, self.ngram.terms)
class NodeNgramNgram(models.Model):
node = models.ForeignKey(Node)
ngramX = models.ForeignKey(Ngram, related_name="nodengramngramx", on_delete=models.CASCADE)
ngramY = models.ForeignKey(Ngram, related_name="nodengramngramy", on_delete=models.CASCADE)
score = models.FloatField(default=0)
def __str__(self):
return "%s: %s / %s" % (self.node.name, self.ngramX.terms, self.ngramY.terms)
import node.models
from parsing.NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor
from collections import defaultdict
class NgramsCache(defaultdict):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time.
This class is language-specific."""
def __init__(self, language):
"""The cache only works with one language,
which is the required parameter of the constructor."""
self.language = language
def __missing__(self, terms):
"""If the terms are not yet present in the dictionary,
retrieve them from the database or insert them."""
try:
ngram = node.models.Ngram.objects.get(terms=terms, language=self.language)
except:
ngram = node.models.Ngram(terms=terms, n=len(terms.split()), language=self.language)
ngram.save()
self[terms] = ngram
return self[terms]
class NgramsCaches(defaultdict):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time."""
def __missing__(self, language):
"""If the cache for this language is not reachable,
add id to the dictionary."""
self[language] = NgramsCache(language)
return self[language]
class NgramsExtractorsCache(defaultdict):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time."""
def __missing__(self, key):
"""If the ngrams extractor is not instancianted yet
for the given language, do it!"""
# format the language
if isinstance(key, str):
language = key.strip().lower()
elif key:
language = key.iso2
else:
language = None
# find the proper extractor
if language in ["en", "eng", "english"]:
Extractor = EnglishNgramsExtractor
elif language in ["fr", "fra", "fre", "french"]:
Extractor = FrenchNgramsExtractor
else:
Extractor = NgramsExtractor
# try to see if already instantiated with another key
found = False
for extractor in self.values():
if type(extractor) == Extractor:
self[key] = extractor
found = True
break
# well if not, let's instantiate it...
if not found:
self[key] = Extractor()
# return the proper extractor
return self[key]
class LanguagesCache(defaultdict):
def __missing__(self, key):
if len(self) == 0:
for language in node.models.Language.objects.all():
self[str(language.iso2.lower())] = language
self[str(language.iso3.lower())] = language
self[str(language.fullname.lower())] = language
betterKey = key.strip().lower()
self[key] = self[betterKey] if betterKey in self.keys() else None
return self[key]
class Caches:
"""This is THE cache of the caches.
See NgramsCaches and NgramsExtractorsCache for better understanding."""
def __init__(self):
self.ngrams = NgramsCaches()
self.extractors = NgramsExtractorsCache()
self.languages = LanguagesCache()
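A minimal sketch of how these caches are meant to be combined (illustrative only; it assumes the Language table is already populated, and the term 'climate change' is made up):

from parsing.Caches import Caches

caches = Caches()
english = caches.languages['en']                  # Language row, memoised after the first lookup
extractor = caches.extractors[english]            # one NgramsExtractor instance per language
ngram = caches.ngrams[english]['climate change']  # get-or-create the Ngram row, then cache it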
from node.models import Node, NodeType, Language, Ngram, Node_Ngram
from parsing.NgramsExtractors import *
import collections
import dateutil.parser
import zipfile
class NgramCache:
"""
This allows the fast retrieval of ngram ids
from the cache instead of using the database for every call
"""
def __init__(self, language):
self._cache = dict()
self._language = language
def __getitem__(self, terms):
terms = terms.strip().lower()
if terms not in self._cache:
try:
ngram = Ngram.get(terms=terms, language=self._language)
except:
ngram = Ngram(terms=terms, n=len(terms.split()), language=self._language)
ngram.save()
self._cache[terms] = ngram
return self._cache[terms]
class NgramCaches(collections.defaultdict):
def __missing__(self, language):
self[language] = NgramCache(language)
return self[language]
from parsing.Caches import LanguagesCache
"""Base class for performing files parsing depending on their type.
"""
class FileParser:
def __init__(self, file=None, filepath="", encoding="utf8"):
# ...get the file item...
if file is None:
self._file = open(filepath, "rb")
else:
self._file = file
# cache for ngrams
self._ngramcaches = NgramCaches()
# extractors
self._extractors = dict()
self._document_nodetype = NodeType.objects.get(name='Document')
languages = Language.objects.all()
self._languages_fullname = {language.fullname.lower(): language for language in languages}
self._languages_iso2 = {language.iso2.lower(): language for language in languages}
self._languages_iso3 = {language.iso3.lower(): language for language in languages}
#self.parse()
"""Extract the ngrams from a given text.
"""Base class for performing files parsing depending on their type.
"""
def extract_ngrams(self, text, language):
# Get the appropriate ngrams extractor, if it exists
if language not in self._extractors:
extractor = None
if language.iso2 == 'en':
extractor = EnglishNgramsExtractor()
elif language.iso2 == 'fr':
extractor = FrenchNgramsExtractor()
self._extractors[language] = extractor
else:
extractor = self._extractors[language]
# Extract the ngrams
if extractor:
tokens = []
for ngram in extractor.extract_ngrams(text):
ngram_text = ' '.join([token for token, tag in ngram])
tokens.append(ngram_text)
return collections.Counter(
# [token for token, tag in extractor.extract_ngrams(text)]
tokens
)
else:
return dict()
def __init__(self, language_cache=None):
self._languages_cache = LanguagesCache() if language_cache is None else language_cache
#TODO
# * make it possible to tag and parse separately
# * only tag some data (only titles, titles & abstracts, some chapters...)
"""Add a document to the database.
"""
def create_document(self, parentNode, title, contents, language, metadata, guid=None):
metadata = self.format_metadata(metadata)
# create or retrieve a resource for that document, based on its user id
# if guid is None:
# resource = Resource(guid=guid)
# else:
# try:
# resource = Resource.get(guid=guid)
# except:
# resource = Resource(guid=guid)
# # If the parent node already has a child with this resource, pass
# # (is it a good thing?)
# if parentNode.descendants().filter(resource=resource).exists():
# return None
# create the document itself
if len(title) > 200:
title = title[:200]
childNode = Node(
user = parentNode.user,
type = self._document_nodetype,
name = title,
language = language,
metadata = metadata,
#resource = resource,
parent = parentNode
)
childNode.save()
# parse it!
ngrams = self.extract_ngrams(contents, language)
# we are already in a transaction, so no use doing another one (or is there?)
ngramcache = self._ngramcaches[language]
for terms, occurences in ngrams.items():
ngram = ngramcache[terms]
Node_Ngram(
node = childNode,
ngram = ngram,
occurences = occurences
).save()
# return the created document
return childNode
"""Useful method to detect the document encoding.
Not sure it should be here actually.
"""
def detect_encoding(self, string):
# see the chardet library
"""Useful method to detect the document encoding.
"""
pass
"""Parse the data.
This method shall be overridden by inherited classes.
"""
def parse(self):
return list()
def format_metadata_dates(self, metadata):
"""Format the dates found in the metadata.
Example: {"publication_date": "2014-10-23 09:57:42"} -> {...}
Examples:
{"publication_date": "2014-10-23 09:57:42"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
......@@ -187,10 +62,49 @@ class FileParser:
metadata[prefix + "_minute"] = date.strftime("%M")
metadata[prefix + "_second"] = date.strftime("%S")
# finally, return the result!
# finally, return the transformed result!
return metadata
def format_metadata_languages(self, metadata):
"""format the languages found in the metadata."""
language = None
for key in ["fullname", "iso3", "iso2"]:
language_key = "language_" + key
if language_key in metadata:
language_symbol = metadata[language_key]
language = self._languages_cache[language_symbol]
if language:
break
if language:
metadata["language_iso2"] = language.iso2
metadata["language_iso3"] = language.iso3
metadata["language_fullname"] = language.fullname
return metadata
def format_metadata(self, metadata):
"""Format the metadata."""
metadata = self.format_metadata_dates(metadata)
metadata = self.format_metadata_languages(metadata)
return metadata
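# Overall effect, following the docstrings above (sketch):
#   format_metadata({'publication_year': '2014', 'language_iso3': 'eng'})
#   -> the date part becomes {'publication_date': '2014-01-01 00:00:00',
#      'publication_year': '2014', ...} and, if 'eng' is known to the languages
#      cache, 'language_iso2' and 'language_fullname' are filled in as well.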
def _parse(self, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, file):
"""Parse the file, and its children files found in the file.
"""
# initialize the list of metadata
metadata_list = []
# if the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file):
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
metadata_list += self.parse(zipArchive.open(filename, "r"))
# ...otherwise, let's parse it directly!
else:
metadata_list += self._parse(file)
# return the list of formatted metadata
return map(self.format_metadata, metadata_list)
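A short usage sketch of the new parse() entry point (the archive path is illustrative; this mirrors the call made from Node.parse_resources above):

from parsing.FileParsers.PubmedFileParser import PubmedFileParser

parser = PubmedFileParser()
# parse() recurses into ZIP archives and returns formatted metadata dicts
for metadata in parser.parse('./data_samples/pubmed.zip'):
    print(metadata.get('title', ''), metadata.get('publication_date', ''))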
......@@ -2,54 +2,36 @@ from django.db import transaction
from lxml import etree
from parsing.FileParsers.FileParser import FileParser
from parsing.NgramsExtractors import *
import zipfile
import datetime
class PubmedFileParser(FileParser):
def parse(self, parentNode=None, tag=True):
def _parse(self, file):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
documents = []
with transaction.atomic():
with zipfile.ZipFile(self._file) as zipFile:
for filename in zipFile.namelist():
file = zipFile.open(filename, "r")
xml = etree.parse(file, parser=xml_parser)
# parse all the articles, one by one
# all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle')
for xml_article in xml_articles:
# extract data from the document
metadata = {}
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
}
for key, path in metadata_path.items():
try:
node = xml_article.find(path)
metadata[key] = node.text
except:
metadata[key] = ""
contents = metadata["abstract"]
# create the document in the database
document = self.create_document(
parentNode = parentNode,
title = metadata["title"],
contents = contents,
language = self._languages_iso3[metadata["language_iso3"].lower()],
metadata = metadata,
#guid = metadata["doi"],
)
if document:
documents.append(document)
return documents
xml = etree.parse(file, parser=xml_parser)
xml_articles = xml.findall('PubmedArticle')
# initialize the list of metadata
metadata_list = []
# parse all the articles, one by one
for xml_article in xml_articles:
# extract data from the document
metadata = {}
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
}
for key, path in metadata_path.items():
try:
node = xml_article.find(path)
metadata[key] = node.text
except:
metadata[key] = ""
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
......@@ -7,33 +7,24 @@ class RisFileParser(FileParser):
_parameters = {
}
def _parse(self, parentNode, file):
def _parse(self, file):
metadata_list = []
metadata = {}
last_key = None
last_values = []
with transaction.atomic():
for line in self._file:
if len(line) > 2:
parameter_key = line[:2]
if parameter_key != b' ' and parameter_key != last_key:
if last_key in self._parameters:
parameter = self._parameters[last_key]
if parameter["type"] == "metadata":
separator = parameter["separator"] if "separator" in parameter else ""
metadata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter":
language = self._languages_fullname[metadata["language"].lower()]
self.create_document(
parentNode = parentNode,
title = metadata["title"],
metadata = metadata,
guid = metadata["doi"]
)
# print(self.format_metadata(metadata))
# print()
metadata = {}
last_key = parameter_key
last_values = []
last_values.append(line[3:-1].decode())
self._file.close()
for line in file:
if len(line) > 2:
parameter_key = line[:2]
if parameter_key != b' ' and parameter_key != last_key:
if last_key in self._parameters:
parameter = self._parameters[last_key]
if parameter["type"] == "metadata":
separator = parameter["separator"] if "separator" in parameter else ""
metadata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter":
language = self._languages_fullname[metadata["language"].lower()]
metadata_list.append(metadata)
last_key = parameter_key
last_values = []
last_values.append(line[3:-1].decode())
return metadata_list
......@@ -7,3 +7,4 @@ class EnglishNgramsExtractor(NgramsExtractor):
def start(self):
self.tagger = NltkTagger()
\ No newline at end of file
#from NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
#from NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
......@@ -46,7 +46,7 @@ Shall be used for french texts.
"""
class TreeTagger(Tagger):
def start(self, treeTaggerPath = "./Taggers/treetagger"):
def start(self, treeTaggerPath = "./parsing/Taggers/treetagger"):
binaryFile = "%s/bin/tree-tagger" % treeTaggerPath
tagcmdlist = [
binaryFile,
......
from parsing.Taggers.Tagger import Tagger
from parsing.Taggers.NltkTagger import NltkTagger
from parsing.Taggers.TreeTagger import TreeTagger
#from .Taggers import *
#from .NgramsExtractors import *
from .FileParsers import *
from node.models import Node, NodeType
import zipfile
import collections
# import chardet
class Parser:
def __init__(self):
pass
def parse_file(self, file):
# CHECKER GUID!!!!!!!!!!!!!!!!!!!!!!!!!!!!
pass
def parse_node_fichier(self, node):
if node.fichier and zipfile.is_zipfile(node.fichier):
with zipfile.ZipFile(node.fichier, "r") as zipFile:
node_type = NodeType.objects.get(name="Document")
for filename in zipFile.namelist():
file = zipFile.open(filename, "r")
Node.objects.create(
parent = node,
type = node_type,
user = node.user,
)
def parse_node(self, node):
for resource in node.resource.all():
if resource.file and zipfile.is_zipfile(resource.file):
with zipfile.ZipFile(resource.file, "r") as zipFile:
for filename in zipFile.namelist():
file = zipFile.open(filename, "r")
Node.objects.create(
parent = node,
type = NodeType.objects.get(name="Document"),
user = node.user,
)
def parse_node_recursively(self, node):
self.parse_node(node)
for descendant in node.get_descendants():
self.parse_node(descendant)
from node.models import Node, NodeType, User, Language, ResourceType
from parsing.Caches import Caches
try:
me = User.objects.get(username='Mat')
except:
me = User(username='Mat')
me.save()
try:
typePubmed = ResourceType.objects.get(name='pubmed')
except:
typePubmed = ResourceType(name='pubmed')
typePubmed.save()
try:
typeCorpus = NodeType.objects.get(name='corpus')
typeDoc = NodeType.objects.get(name='document')
except:
typeCorpus = NodeType(name='corpus')
typeCorpus.save()
typeDoc = NodeType(name='document')
typeDoc.save()
english = Language.objects.get(iso2='en')
Node.objects.all().delete()
try:
corpus = Node.objects.get(name='My first corpus')
except:
corpus = Node(name='My first corpus', type=typeCorpus, user=me)
corpus.save()
print('Remove previously existing children of the corpus...')
corpus.children.all().delete()
print('Adding a resource to the corpus...')
corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed)
print('Adding the corpus resources...')
corpus.parse_resources()
print('Extracting ngrams from the documents...')
corpus.children.all().extract_ngrams(['title', 'abstract'])