Commit fb17d8c4 authored by Administrator's avatar Administrator

Merge avec la branche de mat (avec ajout test)

parents 4c394a92 95b01d7d
#!/usr/bin/env python
import os
import sys
if __name__ == "__main__":
    # Point Django at the project's settings unless the caller set them already.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
    # Imported here, inside the main guard, as in Django's stock manage.py.
    from django.core.management import execute_from_command_line
    command_line_args = sys.argv
    execute_from_command_line(command_line_args)
from django.db import transaction
from FileParser import FileParser
class IsiFileParser(FileParser):
    """Parse a PubMed-style XML export and yield one document per article."""

    def parse(self, parentNode):
        """Yield the document nodes created from every article in the file.

        parentNode -- the corpus node the new document nodes are attached to.

        Yields whatever ``self.create_document`` returns for each article
        (``None`` for duplicates, per the base class).
        """
        # Local imports: lxml provides the lenient parser options used below
        # (resolve_entities / recover are lxml-only); datetime for date_pub.
        import datetime
        from lxml import etree

        # Parse the file exactly once.  The previous version wrapped this in
        # `for line in self.__file:`, which re-parsed the whole document for
        # every input line and used the wrong attribute name (the base class
        # stores the handle as `self._file`).
        xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
        xml = etree.parse(self._file, parser=xml_parser)

        # parse all the articles, one by one
        # all database operations should be performed within one transaction
        xml_articles = xml.findall('PubmedArticle')
        with transaction.atomic():
            for xml_article in xml_articles:
                # extract data from the document
                date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
                date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
                date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
                metadata = {
                    # other metadata should also be included:
                    # authors, submission date, etc.
                    "date_pub": datetime.date(date_year, date_month, date_day),
                    "journal": xml_article.find('MedlineCitation/Article/Journal/Title').text,
                    "title": xml_article.find('MedlineCitation/Article/ArticleTitle').text,
                    "language_iso3": xml_article.find('MedlineCitation/Article/Language').text,
                    # ArticleId elements are discriminated by their IdType
                    # attribute; the original "[type=doi]" is not valid XPath.
                    "doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[@IdType="doi"]').text,
                }
                contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
                # create the document in the database
                yield self.create_document(
                    parentNode = parentNode,
                    title = metadata["title"],
                    contents = contents,
                    # the language code is stored under "language_iso3" above;
                    # the original looked it up under a nonexistent "language" key
                    language = self._languages_iso3[metadata["language_iso3"].lower()],
                    metadata = metadata,
                    guid = metadata["doi"],
                )
from NgramsExtractors import *
from Taggers import *
# English sample inputs for the tagger / ngrams-extractor smoke test.
texts = [
    "This is quite a simple test.",
    "Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe.",
    "James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour.",
]
tagger = NltkTagger()
extractor = EnglishNgramsExtractor()

# French variant, kept for manual switching:
# texts = [
#     "La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini.",
#     "Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie.",
#     "Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone.",
# ]
# tagger = TreeTagger()
# extractor = FrenchNgramsExtractor()

for sample in texts:
    # Show the tagged tokens first, then the ngrams found in the same text.
    print(tagger.tag_text(sample))
    print()
    for ngram in extractor.extract_ngrams(sample):
        print("\t" + str(ngram))
    print("\n")
\ No newline at end of file
...@@ -20,6 +20,14 @@ class NgramCache: ...@@ -20,6 +20,14 @@ class NgramCache:
self._cache[terms] = ngram self._cache[terms] = ngram
return self._cache[terms] return self._cache[terms]
class NgramCaches(collections.defaultdict):
def __missing__(self, language):
self[language] = NgramCache(language)
return self[language]
"""Base class for performing files parsing depending on their type. """Base class for performing files parsing depending on their type.
""" """
...@@ -32,7 +40,7 @@ class FileParser: ...@@ -32,7 +40,7 @@ class FileParser:
else: else:
self._file = file self._file = file
# cache for ngrams # cache for ngrams
self._ngramcaches = collections.defaultdicts(NgramCache) self._ngramcaches = NgramCaches()
# extractors # extractors
self._extractors = dict() self._extractors = dict()
self._document_nodetype = NodeType.get(name='Document') self._document_nodetype = NodeType.get(name='Document')
...@@ -76,7 +84,8 @@ class FileParser: ...@@ -76,7 +84,8 @@ class FileParser:
resource = Resource(guid=guid) resource = Resource(guid=guid)
# If the parent node already has a child with this resource, pass # If the parent node already has a child with this resource, pass
# (is it a good thing?) # (is it a good thing?)
if parentNode.get_descendants(): if parentNode.get_descendants().filter(resource=resource).exists():
return None
# create the document itself # create the document itself
childNode = Node( childNode = Node(
user = parentNode.pk, user = parentNode.pk,
...@@ -90,8 +99,8 @@ class FileParser: ...@@ -90,8 +99,8 @@ class FileParser:
# parse it! # parse it!
ngrams = self.extract_ngrams(contents, language) ngrams = self.extract_ngrams(contents, language)
# we should already be in a transaction, so no use doing another one (or is there?) # we are already in a transaction, so no use doing another one (or is there?)
ngramcache = self._ngramcaches[language.iso3] ngramcache = self._ngramcaches[language]
for terms, occurences in ngrams.items(): for terms, occurences in ngrams.items():
ngram_text = ' '.join([term[0] for term in terms]) ngram_text = ' '.join([term[0] for term in terms])
ngram = ngramcache[ngram_text] ngram = ngramcache[ngram_text]
......
...@@ -11,6 +11,7 @@ class PubmedFileParser(FileParser): ...@@ -11,6 +11,7 @@ class PubmedFileParser(FileParser):
# parse all the articles, one by one # parse all the articles, one by one
# all database operations should be performed within one transaction # all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle') xml_articles = xml.findall('PubmedArticle')
documents = []
with transaction.atomic(): with transaction.atomic():
for xml_article in xml_articles: for xml_article in xml_articles:
# extract data from the document # extract data from the document
...@@ -30,9 +31,14 @@ class PubmedFileParser(FileParser): ...@@ -30,9 +31,14 @@ class PubmedFileParser(FileParser):
# create the document in the database # create the document in the database
yield self.create_document( yield self.create_document(
parentNode = parentNode, parentNode = parentNode,
document = self.create_document(
parentNode = parentNode,
title = metadata["title"], title = metadata["title"],
contents = contents, contents = contents,
language = self._languages_iso3[metadata["language"].lower()], language = self._languages_iso3[metadata["language"].lower()],
metadata = metadata, metadata = metadata,
guid = metadata["doi"], guid = metadata["doi"],
) )
if document:
documents.append(document)
return documents
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment