Commit 9968bfff authored by Administrator

isi, ris, pubmed parsers (connecting not finished)

parent 8fe10cc8
@@ -177,7 +177,7 @@ class Europresse(Document):
                 'authors': "", 'section': "", 'page': "", 'text': "", 'object_id': ""}
             count += 1

-    def ajouter(self, project=None, corpus=None, user=None):
+    def add(self, project=None, corpus=None, user=None):
         """ Appends notices to self.corpus from self.data removing duplicates"""
         for i in self.data:
             if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
...
@@ -2,11 +2,13 @@
 # import Celery here

 from sources.europresse import Europresse
-#from sources.isi import Isi
+from sources.isi import Isi
+from sources.pubmed import Pubmed

 import zipfile

 def importer(source, language, zip_file, project=None, corpus=None, user=None):
     if source.database == "Europresse":
         try:
             c = Europresse()
@@ -15,11 +17,50 @@ def importer(source, language, zip_file, project=None, corpus=None, user=None):
                     for fichiers in z.namelist():
                         fichier = z.open(fichiers, 'r')
                         c.parse(fichier)
-            c.ajouter(project=project, corpus=corpus, user=user)
+            c.add(project=project, corpus=corpus, user=user)
         except Exception as e:
             print(e)
     elif source.database == "Isi":
+        try:
+            c = Isi()
+            if zipfile.is_zipfile(zip_file):
+                with zipfile.ZipFile(zip_file, 'r') as z:
+                    for fichiers in z.namelist():
+                        fichier = z.open(fichiers, 'r')
+                        c.parse(fichier, bdd='isi')
+            c.add(project=project, corpus=corpus, user=user)
+        except Exception as e:
+            print(e)
+    elif source.database == "Ris":
+        try:
+            # RIS files go through the Isi parser with the 'ris' parameter set.
+            c = Isi()
+            if zipfile.is_zipfile(zip_file):
+                with zipfile.ZipFile(zip_file, 'r') as z:
+                    for fichiers in z.namelist():
+                        fichier = z.open(fichiers, 'r')
+                        c.parse(fichier, bdd='ris')
+            c.add(project=project, corpus=corpus, user=user)
+        except Exception as e:
+            print(e)
+    elif source.database == "Pubmed":
+        try:
+            c = Pubmed()
+            if zipfile.is_zipfile(zip_file):
+                with zipfile.ZipFile(zip_file, 'r') as z:
+                    for fichiers in z.namelist():
+                        fichier = z.open(fichiers, 'r')
+                        c.parse(fichier)
+            c.add(project=project, corpus=corpus, user=user)
+        except Exception as e:
+            print(e)
+    else:
         pass
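
A minimal usage sketch for the dispatcher above (hypothetical caller: the source object and the uploaded zip_file come from the view layer, which is not part of this commit):

    importer(source, 'en', zip_file, project=project, corpus=corpus, user=user)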
import os, sys
#reload(sys)
import re
import locale

from datetime import datetime, date
from dateutil import parser

#sys.path.append("../../gargantext/")
#from .corpus import Corpus
from documents.models import Document

#TODO:
# use separators in parameters

class Isi() :
    """
    Thomson ISI parser
    """
    def __init__(self) :
        """
        See Corpus class which declares what a corpus is
        """
        # Specific declarations for ISI
        self.data = []
        self.object_ids = []

    def read_param(self, file) :
        """
        The file is an init file of parameters.
        The function returns a dict of parameters for the following parse function.
        """
        with open(file, 'r') as source:
            lines = source.readlines()
        tags = {}
        for line in lines:
            if line[0] != '#':
                # Each non-comment line is NAME<TAB>FIELD<TAB>SEPARATOR,
                # keyed here by the two-letter FIELD tag.
                tag = line.split('\t')
                tags[tag[1]] = [tag[0], tag[2]]
        return tags
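
    # Illustrative input/output for read_param (assuming a tab-separated line
    # such as 'authors\tAU\t\n' in sources/parameters/isi.init):
    #
    #   >>> tags = Isi().read_param('sources/parameters/isi.init')
    #   >>> tags['AU']
    #   ['authors', '\n']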
    def rules(self, parameters) :
        """
        Interprets and applies the rules described in parameters.init for each field.
        """
        pass
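
    # Not implemented yet; one possible shape (purely illustrative), in line
    # with the TODO above about using the separators declared in the init file:
    #
    #   for field, (name, sep) in parameters.items():
    #       if name in document and sep not in ('', '""'):
    #           document[name] = document[name].split(sep)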
    def parse(self, source, bdd='isi') :
        """
        The dict needed is parameters, the result of the read_param function.
        The file needed is the file to be parsed, in raw text only.
        """
        #source = open(file, 'r')
        lines = source.readlines()
        document = {}
        if bdd == 'isi':
            parameters = self.read_param('sources/parameters/isi.init')
        elif bdd == 'ris':
            parameters = self.read_param('sources/parameters/ris.init')
        for key in list(parameters.keys()):
            if parameters[key][0] == 'BEGIN' :
                begin = str(key)
                del parameters[begin]
            elif parameters[key][0] == 'END' :
                end = str(key)
                del parameters[end]
        for line in lines :
            if document == {} and line[:2] == begin :
                document['url'] = " "
                key = ""
                result = ""
            elif line[:2] in parameters.keys() :
                if key != "" and key != line[:2]:
                    try:
                        document[parameters[key][0]] = result
                    except Exception as e: print(e)
                    #document.setdefault(parameters[key][0],[]).append(result)
                key = line[:2]
                result = line[2:].strip()
            elif line[:2] == '  ' :
                # Continuation line: the tag columns are blank, so append
                # to the current field.
                try:
                    result = result + ' ' + line[2:].strip()#.split(";")
                except Exception as error :
                    pass
            elif line[:2] == end :
                document[parameters[key][0]] = result
                try:
                    try:
                        date = document['year'] + " " + document['month']
                        document['date'] = parser.parse(date)
                    except:
                        date = document['year']
                        document['date'] = datetime.strptime(date, '%Y')
                except Exception as e: print('date parsing error:', e)
                self.data.append(document)
                document = {}
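
    # Illustrative fragment of an ISI record and the resulting entry in
    # self.data (field names as mapped by sources/parameters/isi.init):
    #
    #   PT J
    #   AU Doe, J.
    #   TI A sample title
    #   PY 2013
    #   ER
    #
    # yields a dict containing {'authors': 'Doe, J.', 'title': 'A sample title',
    # 'year': '2013', 'date': datetime(2013, 1, 1, 0, 0), ...}.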
    def add(self, project=None, corpus=None, user=None):
        """ Appends notices to self.corpus from self.data, removing duplicates. """
        for i in self.data:
            # The unique UT field is mapped to 'object_id' in isi.init
            # (the legend there calls this reserved name 'ID-unique').
            if i['object_id'] not in self.object_ids and isinstance(i.get('date'), datetime):
                self.object_ids.append(i['object_id'])
                doc = Document()
                doc.project = project
                doc.user = user
                doc.date = i['date']
                doc.uniqu_id = i['object_id']
                doc.title = i['title']
                doc.source = i['source']
                doc.authors = i['authors']
                doc.text = i['text']
                doc.save()
                doc.corpus.add(corpus)
        self.data = []
def demo():
    """ Parses the file given on the command line and reports a count. """
    data = Isi()
    with open(sys.argv[1], 'r') as source:
        data.parse(source, bdd='isi')
    print(len(data.data), "documents parsed")

if __name__ == "__main__" :
    try:
        demo()
    except Exception as error :
        print(error)
##############################################################################
# LEGEND:
# NAME (what you want[1])  FIELD (see your data)  SEPARATORS (see your data)
#
# [1]
# Be careful with these name variables, which must not change:
# BEGIN, ID-unique, END
##############################################################################
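# Example: the line 'authors<TAB>AU<TAB>\n' tells the parser to store the
# content of the ISI field AU under the name 'authors', with newline as
# the separator.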
BEGIN PT ""
authors AU \n
AF AF "\n"
title TI ""
source SO "\n"
language LA ""
DT DT ""
keywords DE ;
ID ID ;
text AB
ISIC1 C1 \n
reprint_author RP ,
email EM \n
thanks FX
CR CR \n
number NR \n
TC TC ""
Z9 Z9 ""
PU PU ""
PI PI ""
PA PA ""
SN SN ""
journal_small J9 ""
JI JI ""
month PD ""
year PY ""
volume VL ""
IS IS ""
BP BP ""
EP EP ""
DOI DI ""
page PG ""
field WC ""
SC SC ""
GA GA ""
object_id UT ""
END ER ""
tag_begin PMID-
tag_end $ligne$
longueur_tag 6
condition_debut_tag \S\S\s
AU - ISIAUTHOR $ligne$
TI - ISITITLE
PT - ISIDT
MH - ISIkeyword ;
FAU - ISIAF $ligne$
TA - ISIJOURNAL
JT - ISIJOURNALFull
RN - ISIID ;
AB - ISIABSTRACT
AD - ISIC1 $ligne$ ***, [***]
AD - ISIRP ,
AD - ISIFU ;
nope- ISICR $ligne$ ,
SO - ISITC
JT - ISISO
DA - ISIpubdate
VI - ISIVolume
PG - ISIPage
MH - ISISC ;
PMID- ISIUT
PMID- ISIDOI
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Pubmed Database parser

__author__  : http://alexandre.delanoe.org
__licence__ : GPL version 3.0+
__DATE__    : 2014
__VERSION__ : 0.1
"""

import datetime
import sys, string, codecs
from lxml import etree

from documents.models import Document
class Pubmed() :
    """
    Pubmed, Medline corpus parser
    """
    def __init__(self) :
        """
        See Corpus class which declares what a corpus is
        """
        # The Corpus base class is not imported here yet (see the commented
        # import in isi.py), so declare the collections directly.
        self.bdd = "Medline"
        self.data = []
        self.object_ids = []

# class Article(Text):
#     def __init__(self) :
#         Text.__init__(self)
    def parse(self, file, bdd="PUBMED") :
        """
        The file needed is the file to be parsed, in XML format.
        The bdd is the field of BDD-SOURCE.
        """
        parser = etree.XMLParser(resolve_entities=False, recover=True)
        # etree.parse accepts a file name or an open file object, so this
        # also works with the zip members handed over by the importer.
        xml = etree.parse(file, parser=parser)
        xml_docs = xml.findall('PubmedArticle/MedlineCitation')
        for xml_doc in xml_docs:
            # Collect one dict per article, as in the Isi parser.
            document = {}
            year  = int(xml_doc.find('DateCreated/Year').text)
            month = int(xml_doc.find('DateCreated/Month').text)
            day   = int(xml_doc.find('DateCreated/Day').text)
            document['date']    = datetime.date(year, month, day)
            document['journal'] = xml_doc.find('Article/Journal/Title').text
            document['title']   = xml_doc.find('Article/ArticleTitle').text
            self.data.append(document)
        # if xml_doc.find("PubmedArticle") is not None :
        #     print ok
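
    # For the sample record quoted at the bottom of this file, parse()
    # appends (illustrative):
    #
    #   {'date': datetime.date(2013, 12, 23),
    #    'journal': 'Human and ecological risk assessment : HERA',
    #    'title': 'A Causal Analysis of Observed Declines in Managed '
    #             'Honey Bees (Apis mellifera).'}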
    def add(self, project=None, corpus=None, user=None):
        """ Saving to the Document model is not connected yet (see commit message). """
        pass
def demo(file):
    data = Pubmed()
    #data.parse(file='../data/pubmed/pubmed_result.xml')
    data.parse(file)
    print(data.data[0])
    # for i in data.keys():
    #     print i

if __name__ == "__main__" :
    try:
        demo(sys.argv[1])
    except Exception as error :
        print(error)
#
#<PubmedArticle>
# <MedlineCitation Status="Publisher" Owner="NLM">
# <PMID Version="1">24363549</PMID>
# <DateCreated>
# <Year>2013</Year>
# <Month>12</Month>
# <Day>23</Day>
# </DateCreated>
# <Article PubModel="Print-Electronic">
# <Journal>
# <ISSN IssnType="Print">1080-7039</ISSN>
# <JournalIssue CitedMedium="Print">
# <Volume>20</Volume>
# <Issue>2</Issue>
# <PubDate>
# <Year>2014</Year>
# <Month>Feb</Month>
# </PubDate>
# </JournalIssue>
# <Title>Human and ecological risk assessment : HERA</Title>
# <ISOAbbreviation>Hum Ecol Risk Assess</ISOAbbreviation>
# </Journal>
# <ArticleTitle>A Causal Analysis of Observed Declines in Managed Honey Bees (Apis mellifera).</ArticleTitle>
# <Pagination>
# <MedlinePgn>566-591</MedlinePgn>
# </Pagination>
# <Abstract>
# <AbstractText NlmCategory="UNLABELLED">The European honey bee (Apis mellifera) is a highly valuable, semi-free-ranging managed agricultural species. While the number of managed hives has been increasing, declines in overwinter survival, and the onset of colony collapse disorder in 2006, precipitated a large amount of research on bees' health in an effort to isolate the causative factors. A workshop was convened during which bee experts were introduced to a formal causal analysis approach to compare 39 candidate causes against specified criteria to evaluate their relationship to the reduced overwinter survivability observed since 2006 of commercial bees used in the California almond industry. Candidate causes were categorized as probable, possible, or unlikely; several candidate causes were categorized as indeterminate due to lack of information. Due to time limitations, a full causal analysis was not completed at the workshop. In this article, examples are provided to illustrate the process and provide preliminary findings, using three candidate causes. Varroa mites plus viruses were judged to be a &quot;probable cause&quot; of the reduced survival, while nutrient deficiency was judged to be a &quot;possible cause.&quot; Neonicotinoid pesticides were judged to be &quot;unlikely&quot; as the sole cause of this reduced survival, although they could possibly be a contributing factor.</AbstractText>
# </Abstract>
# <AuthorList>
# <Author>
# <LastName>Staveley</LastName>
# <ForeName>Jane P</ForeName>
# <Initials>JP</Initials>
# <Affiliation>Exponent, Alexandria, VA, USA.</Affiliation>
# </Author>
# <Author>
# <LastName>Law</LastName>
# <ForeName>Sheryl A</ForeName>
# <Initials>SA</Initials>
# <Affiliation>Exponent, Alexandria, VA, USA.</Affiliation>
# </Author>
# <Author>
# <LastName>Fairbrother</LastName>
# <ForeName>Anne</ForeName>
# <Initials>A</Initials>
# <Affiliation>Exponent, Bellevue, WA, USA.</Affiliation>
# </Author>
# <Author>
# <LastName>Menzie</LastName>
# <ForeName>Charles A</ForeName>
# <Initials>CA</Initials>
# <Affiliation>Exponent, Alexandria, VA, USA.</Affiliation>
# </Author>
# </AuthorList>
# <Language>ENG</Language>
# <PublicationTypeList>
# <PublicationType>JOURNAL ARTICLE</PublicationType>
# </PublicationTypeList>
# <ArticleDate DateType="Electronic">
# <Year>2013</Year>
# <Month>11</Month>
# <Day>25</Day>
# </ArticleDate>
# </Article>
# <MedlineJournalInfo>
# <MedlineTA>Hum Ecol Risk Assess</MedlineTA>
# <NlmUniqueID>9513572</NlmUniqueID>
# <ISSNLinking>1080-7039</ISSNLinking>
# </MedlineJournalInfo>
# <KeywordList Owner="NOTNLM">
# <Keyword MajorTopicYN="N">Varroa</Keyword>
# <Keyword MajorTopicYN="N">causal analysis</Keyword>
# <Keyword MajorTopicYN="N">honey bees</Keyword>
# <Keyword MajorTopicYN="N">neonicotinoids</Keyword>
# </KeywordList>
# </MedlineCitation>
# <PubmedData>
# <History>
# <PubMedPubDate PubStatus="received">
# <Year>2013</Year>
# <Month>7</Month>
# <Day>8</Day>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="accepted">
# <Year>2013</Year>
# <Month>7</Month>
# <Day>23</Day>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="epublish">
# <Year>2013</Year>
# <Month>11</Month>
# <Day>25</Day>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="entrez">
# <Year>2013</Year>
# <Month>12</Month>
# <Day>24</Day>
# <Hour>6</Hour>
# <Minute>0</Minute>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="pubmed">
# <Year>2013</Year>
# <Month>12</Month>
# <Day>24</Day>
# <Hour>6</Hour>
# <Minute>0</Minute>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="medline">
# <Year>2013</Year>
# <Month>12</Month>
# <Day>24</Day>
# <Hour>6</Hour>
# <Minute>0</Minute>
# </PubMedPubDate>
# </History>
# <PublicationStatus>ppublish</PublicationStatus>
# <ArticleIdList>
# <ArticleId IdType="doi">10.1080/10807039.2013.831263</ArticleId>
# <ArticleId IdType="pubmed">24363549</ArticleId>
# <ArticleId IdType="pmc">PMC3869053</ArticleId>
# </ArticleIdList>
# <?pmcsd?>
# </PubmedData>
#</PubmedArticle>
#