Commit 9968bfff authored by Administrator

isi, ris, pubmed parsers (connecting not finished)

parent 8fe10cc8
@@ -177,7 +177,7 @@ class Europresse(Document):
                 'authors': "", 'section': "", 'page': "", 'text': "", 'object_id': ""}
             count += 1

-    def ajouter(self, project=None, corpus=None, user=None):
+    def add(self, project=None, corpus=None, user=None):
         """ Appends notices to self.corpus from self.data removing duplicates"""
         for i in self.data:
             if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
...
@@ -2,11 +2,13 @@
 # import Celery here

 from sources.europresse import Europresse
-#from sources.isi import Isi
+from sources.isi import Isi
+from sources.pubmed import Pubmed

 import zipfile

 def importer(source, language, zip_file, project=None, corpus=None, user=None):
     if source.database == "Europresse":
         try:
             c = Europresse()
@@ -15,11 +17,50 @@ def importer(source, language, zip_file, project=None, corpus=None, user=None):
                     for fichiers in z.namelist():
                         fichier = z.open(fichiers, 'r')
                         c.parse(fichier)
-            c.ajouter(project=project, corpus=corpus, user=user)
+            c.add(project=project, corpus=corpus, user=user)
         except Exception as e:
             print(e)
     elif source.database == "Isi":
+        try:
+            c = Isi()
+            if zipfile.is_zipfile(zip_file):
+                with zipfile.ZipFile(zip_file, 'r') as z:
+                    for fichiers in z.namelist():
+                        fichier = z.open(fichiers, 'r')
+                        c.parse(fichier, bdd='isi')
+            c.add(project=project, corpus=corpus, user=user)
+        except Exception as e:
+            print(e)
+    elif source.database == "Ris":
+        try:
+            # RIS files go through the Isi parser with the 'ris' parameter set.
+            c = Isi()
+            if zipfile.is_zipfile(zip_file):
+                with zipfile.ZipFile(zip_file, 'r') as z:
+                    for fichiers in z.namelist():
+                        fichier = z.open(fichiers, 'r')
+                        c.parse(fichier, bdd='ris')
+            c.add(project=project, corpus=corpus, user=user)
+        except Exception as e:
+            print(e)
+    elif source.database == "Pubmed":
+        try:
+            c = Pubmed()
+            if zipfile.is_zipfile(zip_file):
+                with zipfile.ZipFile(zip_file, 'r') as z:
+                    for fichiers in z.namelist():
+                        fichier = z.open(fichiers, 'r')
+                        c.parse(fichier)
+            c.add(project=project, corpus=corpus, user=user)
+        except Exception as e:
+            print(e)
+    else:
         pass
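
A minimal usage sketch for the dispatcher above (hypothetical caller: the source object and the uploaded zip_file come from the view layer, which is not part of this commit):

    importer(source, 'en', zip_file, project=project, corpus=corpus, user=user)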
import os, sys
#reload(sys)
import re
import locale

from datetime import datetime, date
from dateutil import parser

#sys.path.append("../../gargantext/")
#from .corpus import Corpus
from documents.models import Document

#TODO:
# use separators in parameters

class Isi() :
    """
    Thomson ISI parser
    """
    def __init__(self) :
        """
        See Corpus class which declares what a corpus is
        """
        # Specific declarations for ISI
        self.data = []
        self.object_ids = []

    def read_param(self, file) :
        """
        The file is an init file of parameters.
        The function returns a dict of parameters for the following parse function.
        """
        with open(file, 'r') as source:
            lines = source.readlines()
        tags = {}
        for line in lines:
            if line[0] != '#':
                # Each non-comment line is NAME<TAB>FIELD<TAB>SEPARATOR,
                # keyed here by the two-letter FIELD tag.
                tag = line.split('\t')
                tags[tag[1]] = [tag[0], tag[2]]
        return tags
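
    # Illustrative input/output for read_param (assuming a tab-separated line
    # such as 'authors\tAU\t\n' in sources/parameters/isi.init):
    #
    #   >>> tags = Isi().read_param('sources/parameters/isi.init')
    #   >>> tags['AU']
    #   ['authors', '\n']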
    def rules(self, parameters) :
        """
        Interprets and applies the rules described in parameters.init for each field.
        """
        pass
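
    # Not implemented yet; one possible shape (purely illustrative), in line
    # with the TODO above about using the separators declared in the init file:
    #
    #   for field, (name, sep) in parameters.items():
    #       if name in document and sep not in ('', '""'):
    #           document[name] = document[name].split(sep)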
    def parse(self, source, bdd='isi') :
        """
        The dict needed is parameters, the result of the read_param function.
        The file needed is the file to be parsed, in raw text only.
        """
        #source = open(file, 'r')
        lines = source.readlines()
        document = {}
        if bdd == 'isi':
            parameters = self.read_param('sources/parameters/isi.init')
        elif bdd == 'ris':
            parameters = self.read_param('sources/parameters/ris.init')
        for key in list(parameters.keys()):
            if parameters[key][0] == 'BEGIN' :
                begin = str(key)
                del parameters[begin]
            elif parameters[key][0] == 'END' :
                end = str(key)
                del parameters[end]
        for line in lines :
            if document == {} and line[:2] == begin :
                document['url'] = " "
                key = ""
                result = ""
            elif line[:2] in parameters.keys() :
                if key != "" and key != line[:2]:
                    try:
                        document[parameters[key][0]] = result
                    except Exception as e: print(e)
                    #document.setdefault(parameters[key][0],[]).append(result)
                key = line[:2]
                result = line[2:].strip()
            elif line[:2] == '  ' :
                # Continuation line: the tag columns are blank, so append
                # to the current field.
                try:
                    result = result + ' ' + line[2:].strip()#.split(";")
                except Exception as error :
                    pass
            elif line[:2] == end :
                document[parameters[key][0]] = result
                try:
                    try:
                        date = document['year'] + " " + document['month']
                        document['date'] = parser.parse(date)
                    except:
                        date = document['year']
                        document['date'] = datetime.strptime(date, '%Y')
                except Exception as e: print('date parsing error:', e)
                self.data.append(document)
                document = {}
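
    # Illustrative fragment of an ISI record and the resulting entry in
    # self.data (field names as mapped by sources/parameters/isi.init):
    #
    #   PT J
    #   AU Doe, J.
    #   TI A sample title
    #   PY 2013
    #   ER
    #
    # yields a dict containing {'authors': 'Doe, J.', 'title': 'A sample title',
    # 'year': '2013', 'date': datetime(2013, 1, 1, 0, 0), ...}.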
    def add(self, project=None, corpus=None, user=None):
        """ Appends notices to self.corpus from self.data, removing duplicates. """
        for i in self.data:
            # The unique UT field is mapped to 'object_id' in isi.init
            # (the legend there calls this reserved name 'ID-unique').
            if i['object_id'] not in self.object_ids and isinstance(i.get('date'), datetime):
                self.object_ids.append(i['object_id'])
                doc = Document()
                doc.project = project
                doc.user = user
                doc.date = i['date']
                doc.uniqu_id = i['object_id']
                doc.title = i['title']
                doc.source = i['source']
                doc.authors = i['authors']
                doc.text = i['text']
                doc.save()
                doc.corpus.add(corpus)
        self.data = []
def demo():
    """ Parses the file given on the command line and reports a count. """
    data = Isi()
    with open(sys.argv[1], 'r') as source:
        data.parse(source, bdd='isi')
    print(len(data.data), "documents parsed")

if __name__ == "__main__" :
    try:
        demo()
    except Exception as error :
        print(error)
##############################################################################
# LEGEND:
# NAME (what you want[1])  FIELD (see your data)  SEPARATORS (see your data)
#
# [1]
# Be careful with these name variables, which must not change:
# BEGIN, ID-unique, END
##############################################################################
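# Example: the line 'authors<TAB>AU<TAB>\n' tells the parser to store the
# content of the ISI field AU under the name 'authors', with newline as
# the separator.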
BEGIN PT ""
authors AU \n
AF AF "\n"
title TI ""
source SO "\n"
language LA ""
DT DT ""
keywords DE ;
ID ID ;
text AB
ISIC1 C1 \n
reprint_author RP ,
email EM \n
thanks FX
CR CR \n
number NR \n
TC TC ""
Z9 Z9 ""
PU PU ""
PI PI ""
PA PA ""
SN SN ""
journal_small J9 ""
JI JI ""
month PD ""
year PY ""
volume VL ""
IS IS ""
BP BP ""
EP EP ""
DOI DI ""
page PG ""
field WC ""
SC SC ""
GA GA ""
object_id UT ""
END ER ""
tag_begin PMID-
tag_end $ligne$
longueur_tag 6
condition_debut_tag \S\S\s
AU - ISIAUTHOR $ligne$
TI - ISITITLE
PT - ISIDT
MH - ISIkeyword ;
FAU - ISIAF $ligne$
TA - ISIJOURNAL
JT - ISIJOURNALFull
RN - ISIID ;
AB - ISIABSTRACT
AD - ISIC1 $ligne$ ***, [***]
AD - ISIRP ,
AD - ISIFU ;
nope- ISICR $ligne$ ,
SO - ISITC
JT - ISISO
DA - ISIpubdate
VI - ISIVolume
PG - ISIPage
MH - ISISC ;
PMID- ISIUT
PMID- ISIDOI
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Pubmed Database parser

__author__  : http://alexandre.delanoe.org
__licence__ : GPL version 3.0+
__DATE__    : 2014
__VERSION__ : 0.1
"""

import datetime
import sys, string, codecs
from lxml import etree

from documents.models import Document
class Pubmed() :
    """
    Pubmed, Medline corpus parser
    """
    def __init__(self) :
        """
        See Corpus class which declares what a corpus is
        """
        # The Corpus base class is not imported here yet (see the commented
        # import in isi.py), so declare the collections directly.
        self.bdd = "Medline"
        self.data = []
        self.object_ids = []

# class Article(Text):
#     def __init__(self) :
#         Text.__init__(self)
    def parse(self, file, bdd="PUBMED") :
        """
        The file needed is the file to be parsed, in XML format.
        The bdd is the field of BDD-SOURCE.
        """
        parser = etree.XMLParser(resolve_entities=False, recover=True)
        # etree.parse accepts a file name or an open file object, so this
        # also works with the zip members handed over by the importer.
        xml = etree.parse(file, parser=parser)
        xml_docs = xml.findall('PubmedArticle/MedlineCitation')
        for xml_doc in xml_docs:
            # Collect one dict per article, as in the Isi parser.
            document = {}
            year  = int(xml_doc.find('DateCreated/Year').text)
            month = int(xml_doc.find('DateCreated/Month').text)
            day   = int(xml_doc.find('DateCreated/Day').text)
            document['date']    = datetime.date(year, month, day)
            document['journal'] = xml_doc.find('Article/Journal/Title').text
            document['title']   = xml_doc.find('Article/ArticleTitle').text
            self.data.append(document)
        # if xml_doc.find("PubmedArticle") is not None :
        #     print ok
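
    # For the sample record quoted at the bottom of this file, parse()
    # appends (illustrative):
    #
    #   {'date': datetime.date(2013, 12, 23),
    #    'journal': 'Human and ecological risk assessment : HERA',
    #    'title': 'A Causal Analysis of Observed Declines in Managed '
    #             'Honey Bees (Apis mellifera).'}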
    def add(self, project=None, corpus=None, user=None):
        """ Saving to the Document model is not connected yet (see commit message). """
        pass
def demo(file):
    data = Pubmed()
    #data.parse(file='../data/pubmed/pubmed_result.xml')
    data.parse(file)
    print(data.data[0])
    # for i in data.keys():
    #     print i

if __name__ == "__main__" :
    try:
        demo(sys.argv[1])
    except Exception as error :
        print(error)
#
#<PubmedArticle>
# <MedlineCitation Status="Publisher" Owner="NLM">
# <PMID Version="1">24363549</PMID>
# <DateCreated>
# <Year>2013</Year>
# <Month>12</Month>
# <Day>23</Day>
# </DateCreated>
# <Article PubModel="Print-Electronic">
# <Journal>
# <ISSN IssnType="Print">1080-7039</ISSN>
# <JournalIssue CitedMedium="Print">
# <Volume>20</Volume>
# <Issue>2</Issue>
# <PubDate>
# <Year>2014</Year>
# <Month>Feb</Month>
# </PubDate>
# </JournalIssue>
# <Title>Human and ecological risk assessment : HERA</Title>
# <ISOAbbreviation>Hum Ecol Risk Assess</ISOAbbreviation>
# </Journal>
# <ArticleTitle>A Causal Analysis of Observed Declines in Managed Honey Bees (Apis mellifera).</ArticleTitle>
# <Pagination>
# <MedlinePgn>566-591</MedlinePgn>
# </Pagination>
# <Abstract>
# <AbstractText NlmCategory="UNLABELLED">The European honey bee (Apis mellifera) is a highly valuable, semi-free-ranging managed agricultural species. While the number of managed hives has been increasing, declines in overwinter survival, and the onset of colony collapse disorder in 2006, precipitated a large amount of research on bees' health in an effort to isolate the causative factors. A workshop was convened during which bee experts were introduced to a formal causal analysis approach to compare 39 candidate causes against specified criteria to evaluate their relationship to the reduced overwinter survivability observed since 2006 of commercial bees used in the California almond industry. Candidate causes were categorized as probable, possible, or unlikely; several candidate causes were categorized as indeterminate due to lack of information. Due to time limitations, a full causal analysis was not completed at the workshop. In this article, examples are provided to illustrate the process and provide preliminary findings, using three candidate causes. Varroa mites plus viruses were judged to be a &quot;probable cause&quot; of the reduced survival, while nutrient deficiency was judged to be a &quot;possible cause.&quot; Neonicotinoid pesticides were judged to be &quot;unlikely&quot; as the sole cause of this reduced survival, although they could possibly be a contributing factor.</AbstractText>
# </Abstract>
# <AuthorList>
# <Author>
# <LastName>Staveley</LastName>
# <ForeName>Jane P</ForeName>
# <Initials>JP</Initials>
# <Affiliation>Exponent, Alexandria, VA, USA.</Affiliation>
# </Author>
# <Author>
# <LastName>Law</LastName>
# <ForeName>Sheryl A</ForeName>
# <Initials>SA</Initials>
# <Affiliation>Exponent, Alexandria, VA, USA.</Affiliation>
# </Author>
# <Author>
# <LastName>Fairbrother</LastName>
# <ForeName>Anne</ForeName>
# <Initials>A</Initials>
# <Affiliation>Exponent, Bellevue, WA, USA.</Affiliation>
# </Author>
# <Author>
# <LastName>Menzie</LastName>
# <ForeName>Charles A</ForeName>
# <Initials>CA</Initials>
# <Affiliation>Exponent, Alexandria, VA, USA.</Affiliation>
# </Author>
# </AuthorList>
# <Language>ENG</Language>
# <PublicationTypeList>
# <PublicationType>JOURNAL ARTICLE</PublicationType>
# </PublicationTypeList>
# <ArticleDate DateType="Electronic">
# <Year>2013</Year>
# <Month>11</Month>
# <Day>25</Day>
# </ArticleDate>
# </Article>
# <MedlineJournalInfo>
# <MedlineTA>Hum Ecol Risk Assess</MedlineTA>
# <NlmUniqueID>9513572</NlmUniqueID>
# <ISSNLinking>1080-7039</ISSNLinking>
# </MedlineJournalInfo>
# <KeywordList Owner="NOTNLM">
# <Keyword MajorTopicYN="N">Varroa</Keyword>
# <Keyword MajorTopicYN="N">causal analysis</Keyword>
# <Keyword MajorTopicYN="N">honey bees</Keyword>
# <Keyword MajorTopicYN="N">neonicotinoids</Keyword>
# </KeywordList>
# </MedlineCitation>
# <PubmedData>
# <History>
# <PubMedPubDate PubStatus="received">
# <Year>2013</Year>
# <Month>7</Month>
# <Day>8</Day>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="accepted">
# <Year>2013</Year>
# <Month>7</Month>
# <Day>23</Day>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="epublish">
# <Year>2013</Year>
# <Month>11</Month>
# <Day>25</Day>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="entrez">
# <Year>2013</Year>
# <Month>12</Month>
# <Day>24</Day>
# <Hour>6</Hour>
# <Minute>0</Minute>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="pubmed">
# <Year>2013</Year>
# <Month>12</Month>
# <Day>24</Day>
# <Hour>6</Hour>
# <Minute>0</Minute>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="medline">
# <Year>2013</Year>
# <Month>12</Month>
# <Day>24</Day>
# <Hour>6</Hour>
# <Minute>0</Minute>
# </PubMedPubDate>
# </History>
# <PublicationStatus>ppublish</PublicationStatus>
# <ArticleIdList>
# <ArticleId IdType="doi">10.1080/10807039.2013.831263</ArticleId>
# <ArticleId IdType="pubmed">24363549</ArticleId>
# <ArticleId IdType="pmc">PMC3869053</ArticleId>
# </ArticleIdList>
# <?pmcsd?>
# </PubmedData>
#</PubmedArticle>
#