Commit 99f2e866 authored by delanoe's avatar delanoe

[CLEAN] removing old directory.

parent cf3c1b70
from django.contrib import admin
# Register your models here.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Europresse Database parser for HTML sources only.
This script is using 3 methods of parsing:
1) REGEX (Regular Expressions) format detection
2) SAX (Simple Api for Xml) like method for events detection
3) DOM (Document Object Model), operating on the document as a whole for
tree detection.
Bug reports? Please contact the author:
__author__ : alexandre+gargantext @ delanoe.org
__licence__ : GPL version 3.0+
__DATE__ : 09 november 2013
__VERSION__ : 2.0
"""
import os
import sys
import imp
imp.reload(sys)
import re
import locale
from datetime import datetime, date
from lxml import etree
from documents.models import Document
#from .corpus import Corpus
class Europresse():
    """
    Europresse HTML export parser.

    1) First build a tree to parse data,
    2) then each notice (article) is nested in a dictionary,
    3) finally, the corpus (self.data) is a list of articles as dictionaries.
    """
    def __init__(self):
        """self.data is the list of articles parsed so far;
        each article is a plain dict of field name -> value."""
        # Specific declarations for Europresse
        self.data = []
        # Encoding: drives both the lxml parser and field encoding below.
        self.codif = "UTF-8"
        # Locale used by strptime to parse localized month names.
        self.localeEncoding = "fr_FR"

    def test_unicode(self, filename):
        # Re-encodes *filename* in place from latin-1 to UTF-8 when the
        # `file` utility reports an iso-8859 charset.
        # NOTE(review): the shell command is built by %-interpolating the
        # filename -- a name containing quotes breaks it (shell-injection
        # risk); prefer subprocess.run with an argument list. TODO confirm
        # callers fully control the file names.
        import os
        os.system("file_europresse=$(mktemp -q); file --mime-encoding \'%s\' | grep -i -- \"iso-8859\" && \
iconv -f latin1 -t utf8 \'%s\' > $file_europresse && \
mv $file_europresse \'%s\'" % (filename, filename, filename))

    def parse(self, filename):
        """Adding filename to self.data after parsing.

        Parses the Europresse HTML export *filename* and appends one dict
        per notice to self.data. Each notice is a <table> under <body>.
        """
        count = 0
        articles = []
        article = {}
        parser = etree.HTMLParser(encoding=self.codif)
        tree = etree.parse(filename, parser)
        articles = tree.xpath('/html/body/table')
        for notice in articles:
            if len(notice):
                # Source / volume come from "Journal name, volume info".
                for name in notice.xpath("./tr/td/span[@class = 'DocPublicationName']"):
                    if name.text is not None:
                        format_journal = re.compile('(.*), (.*)', re.UNICODE)
                        test_journal = format_journal.match(name.text)
                        if test_journal is not None:
                            article['source'] = test_journal.group(1)
                            article['volume'] = test_journal.group(2)
                        else:
                            article['source'] = name.text.encode(self.codif)
                # Header line carries the date (French or English form),
                # the section name and the page number.
                for header in notice.xpath("./tr/td/span[@class = 'DocHeader']"):
                    text = header.text
                    if isinstance(text, bytes):
                        text = text.decode()
                    format_date_fr = re.compile('\d+\s*\w+\s+\d{4}', re.UNICODE)
                    test_date_fr = format_date_fr.match(text)
                    format_date_en = re.compile('\w+\s+\d+,\s+\d{4}', re.UNICODE)
                    test_date_en = format_date_en.match(text)
                    format_sect = re.compile('(\D+),', re.UNICODE)
                    test_sect = format_sect.match(text)
                    format_page = re.compile(', p. (\w+)', re.UNICODE)
                    test_page = format_page.match(text)
                    if test_date_fr is not None:
                        # French month names require the fr_FR locale.
                        self.localeEncoding = "fr_FR"
                        locale.setlocale(locale.LC_ALL, self.localeEncoding)
                        try :
                            article['date'] = datetime.strptime(text, '%d %B %Y')
                        except :
                            try:
                                # Fallback: month + year only.
                                article['date'] = datetime.strptime(text, '%B %Y')
                            except :
                                pass
                    if test_date_en is not None:
                        self.localeEncoding = "en_GB.UTF-8"
                        locale.setlocale(locale.LC_ALL, self.localeEncoding)
                        try :
                            article['date'] = datetime.strptime(text, '%B %d, %Y')
                        except :
                            try :
                                article['date'] = datetime.strptime(text, '%B %Y')
                            except :
                                pass
                    if test_sect is not None:
                        article['section'] = test_sect.group(1).encode(self.codif)
                    if test_page is not None:
                        article['page'] = test_page.group(1).encode(self.codif)
                article['title'] = notice.xpath("string(./tr/td/span[@class = 'TitreArticleVisu'])").encode(self.codif)
                # Body text: every text node except the DocHeader spans.
                article['text'] = notice.xpath("./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()")
                # Authors detection: the text reached at the 2nd <br> after
                # the title span (SAX-like event counting).
                line = 0
                br_tag = 10
                for i in articles[count].iter():
                    # print line, br, i, i.tag, i.attrib, i.tail
                    if i.tag == "span":
                        if "class" in i.attrib:
                            if i.attrib['class'] == 'TitreArticleVisu':
                                line = 1
                                br_tag = 2
                    if line == 1 and i.tag == "br":
                        br_tag -= 1
                    if line == 1 and br_tag == 0:
                        try:
                            article['authors'] = str.title(etree.tostring(i, method="text", encoding=self.codif)).encode(self.codif)#.split(';')
                            #article['authors'] = tuple(article['authors'])
                        except:
                            article['authors'] = 'not found'
                        # Reset the counters for the next notice.
                        line = 0
                        br_tag = 10
                # Date fallback: reuse the previous notice's date when absent.
                # NOTE(review): the `or` makes the condition always true, so
                # the else branch is unreachable -- probably meant `and`.
                try:
                    if article['date'] is not None or article['date'] != '':
                        try:
                            back = article['date']
                        except Exception as e:
                            print(e)
                            pass
                    else:
                        try:
                            article['date'] = back
                        except Exception as e:
                            print(e)
                except :
                    article['date'] = datetime.now()
                # NOTE(review): assumes the unique id is the 9th-from-last
                # text chunk of the notice -- brittle; TODO confirm against
                # real Europresse exports.
                article['uniqu_id'] = article['text'][-9]
                article['text'].pop()
                article['text'] = ' '.join(article['text'])
                # Strip the Europresse copyright footer from the body.
                article['text'] = re.sub('Tous droits réservés.*$', '', article['text'])
                article['bdd'] = 'europresse'
                article['url'] = ''
                self.data.append(article)
                # Fresh empty skeleton for the next notice.
                article = {'source': "", 'volume': "", 'date': "", \
                           'authors': "", 'section': "", 'page':"", 'text': "", 'object_id':""}
                count += 1

    def add(self, project=None, corpus=None, user=None, ids=None):
        """ Appends notices to self.corpus from self.data removing duplicates"""
        # `ids` lets the caller seed the dedup set with ids already stored.
        if ids is not None:
            self.object_ids = ids
        else:
            self.object_ids = set()
        for i in self.data:
            # Keep only unseen notices that carry a real datetime.
            if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
                self.object_ids.add(i['uniqu_id'])
                doc = Document()
                doc.project = project
                doc.user = user
                doc.date = i['date']
                doc.uniqu_id= i['uniqu_id']
                doc.title = i['title']
                doc.source = i['source']
                doc.authors = i['authors']
                doc.text = i['text']
                doc.save()
                doc.corpus.add(corpus)
        # Buffer is flushed once every notice has been stored.
        self.data = []
def demo():
    """Minimal usage example: parse the Europresse HTML file given on the
    command line and print the date of every parsed article.

    BUG FIX: the original body was `try: pass` followed by
    `print(a['date'])` where `a` was never defined (guaranteed NameError).
    """
    import sys
    data = Europresse()
    data.parse(sys.argv[1])
    for article in data.data:
        print(article.get('date'))
if __name__ == "__main__" :
    # Command-line entry point: run the demo, report (not raise) failures.
    try:
        demo()
    except Exception as error:
        print(error)
# import Celery here
from documents.models import Document
from sources.europresse import Europresse
from sources.isi import Isi
from sources.pubmed import Pubmed
import zipfile
def _open_zip_members(zip_file):
    """Yield (name, open file-like object) for every member of *zip_file*.

    Each member handle is closed automatically when the next iteration step
    begins (or when the generator is exhausted/collected).
    """
    with zipfile.ZipFile(zip_file, 'r') as z:
        for name in z.namelist():
            with z.open(name, 'r') as fh:
                yield name, fh


def importer(source, language, zip_file, project=None, corpus=None, user=None):
    """Dispatch *zip_file* to the parser matching source.database and store
    the parsed notices into *corpus*.

    `ids` seeds each parser's dedup set with the uniqu_id of the documents
    already attached to the corpus, so re-imports do not duplicate rows.
    Each branch is best-effort: any failure is printed, never raised.
    """
    ids = {doc.uniqu_id for doc in Document.objects.filter(corpus=corpus)}
    if source.database == "Europresse":
        try:
            print("Europresse DB detected")
            c = Europresse()
            if zipfile.is_zipfile(zip_file):
                for name, fichier in _open_zip_members(zip_file):
                    c.parse(fichier)
                    c.add(project=project, corpus=corpus, user=user, ids=ids)
        except Exception as e:
            print(e)
    elif source.database == "Web of Science (ISI format)":
        try:
            print("ISI DB detected")
            c = Isi()
            if zipfile.is_zipfile(zip_file):
                for name, fichier in _open_zip_members(zip_file):
                    print("parsing %s" % (name))
                    c.parse(fichier, bdd='isi')
                    c.add(project=project, corpus=corpus, user=user, ids=ids)
        except Exception as e:
            print(e)
    elif source.database == "RIS (Zotero)":
        try:
            print("RIS DB detected")
            c = Isi()
            if zipfile.is_zipfile(zip_file):
                # BUG FIX: this branch never closed the members it opened;
                # _open_zip_members now guarantees the close.
                for name, fichier in _open_zip_members(zip_file):
                    c.parse(fichier, bdd='ris')
                    c.add(project=project, corpus=corpus, user=user, ids=ids)
        except Exception as e:
            print(e)
    elif source.database == "Pubmed":
        try:
            print("PubMed DB detected")
            c = Pubmed()
            if zipfile.is_zipfile(zip_file):
                # BUG FIX: member handles were never closed in this branch.
                for name, fichier in _open_zip_members(zip_file):
                    c.parse(fichier)
                    # NOTE(review): Pubmed defines add(file), not
                    # ajouter(...) -- this call raises AttributeError that
                    # the except below swallows. TODO confirm intent.
                    c.ajouter(project=project, corpus=corpus, user=user, ids=ids)
        except Exception as e:
            print(e)
    else:
        print("Corpus not detected")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
ISI parser.
__author__ : alexandre+gargantext @ delanoe.org
__licence__ : GPL version 3.0+
__DATE__ : 2014
__VERSION__ : 1.0
"""
import os, sys
#reload(sys)
import re
import locale
# import hashlib ?
from datetime import datetime, date
from dateutil import parser
from documents.models import Document
#TODO:
# use separators in parameters
class Isi() :
    """
    Thomson ISI / RIS raw-text export parser.

    Field tags are described in an external parameter file (one line per
    field: NAME<TAB>TAG<TAB>SEPARATOR); parse() turns each notice of the
    export into a dict appended to self.data.
    """
    def __init__(self) :
        """Prepare an empty list of parsed notices."""
        self.data = []

    def read_param(self, file) :
        """
        Read the tab-separated parameter file *file* and return a dict
        mapping a 2-letter field tag to [field_name, separator].

        Lines starting with '#' are comments and are skipped. Note that the
        separator keeps its trailing newline, exactly as in the file.
        """
        tags = {}
        # BUG FIX: the original called source.close() AFTER `return`, so the
        # handle was never closed; a context manager guarantees the close.
        with open(file, 'r') as source:
            for line in source.readlines():
                if line[0] != '#':
                    tag = line.split('\t')
                    tags[str(tag[1])] = [str(tag[0]), str(tag[2])]
        return tags

    def rules(self, parameters) :
        """
        Interpret and apply the rules described in parameters.init for each
        field. Not implemented yet.
        """
        pass

    def parse(self, source, bdd='isi') :
        """
        Parse *source* (a binary file object containing the raw export) and
        append one dict per notice to self.data.

        bdd selects the parameter file: 'isi' (default) or 'ris'. The
        BEGIN/END entries of the parameter file delimit a notice; any other
        known 2-letter tag starts a field, and indented lines continue the
        current field.
        """
        lines = source.readlines()
        doc = {}
        if bdd == 'isi':
            try:
                print("reading parameters ISI")
                parameters = self.read_param('sources/parameters/isi.init')
            except Exception as e: print(e)
        elif bdd == 'ris':
            try:
                print("reading parameters RIS")
                parameters = self.read_param('sources/parameters/ris.init')
            except Exception as e: print(e)
        # Pull the BEGIN/END markers out of the field map so they are
        # handled as notice delimiters, not as fields.
        for key in list(parameters.keys()):
            if parameters[key][0] == 'BEGIN' :
                begin = str(key)
                del parameters[begin]
            elif parameters[key][0] == 'END' :
                end = str(key)
                del parameters[end]
        for line in lines :
            line = str(line, encoding='UTF-8')
            if bdd == 'ris':
                # RIS writes "TA  - value"; strip the tag/value separator.
                line = line.replace(' - ', '')
            if doc == {} and line[:2] == begin :
                # A notice starts: reset the field accumulator.
                doc['url'] = " "
                key = ""
                result = ""
            elif line[:2] in parameters.keys() :
                # A new known tag closes the field being accumulated.
                if key != "" and key != line[:2]:
                    try:
                        doc[parameters[key][0]] = result
                    except Exception as e: print(e)
                key = line[:2]
                result = line[2:].strip()
            elif line[:2] == '  ' :
                # Continuation line (starts with blanks): extend the field.
                # NOTE(review): reconstructed as two spaces -- line[:2] can
                # only ever equal a 2-character string; confirm with data.
                try:
                    result = result + ' ' + line[2:].strip()
                except Exception as error :
                    print(error)
            elif line[:2] == end :
                # Notice ends: flush the last field, derive the date, store.
                doc[parameters[key][0]] = result
                try:
                    try:
                        date = doc['year'] + " " + doc['month']
                        doc['date'] = parser.parse(date)
                    except:
                        # Month missing: fall back to the year alone.
                        date = doc['year']
                        doc['date'] = datetime.strptime(date, '%Y')
                except Exception as e:
                    print('88', e)
                try:
                    print(doc['year'])
                except Exception as e: print('58', e)
                self.data.append(doc)
                doc = {}

    def add(self, project=None, corpus=None, user=None, ids=None):
        """ Appends notices to self.corpus from self.data removing duplicates"""
        # `ids` lets the caller seed the dedup set with ids already stored.
        if ids is not None:
            self.object_ids = ids
        else:
            self.object_ids = set()
        for i in self.data:
            if 'uniqu_id' not in i.keys():
                # BUG FIX: the original concatenated the title (str) with
                # the date (datetime), which raised TypeError and aborted
                # the whole import; stringify the date first.
                i['uniqu_id'] = i['title'] + str(i['date'])
            if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
                self.object_ids.add(i['uniqu_id'])
                doc = Document()
                # Each field is stored best-effort: a missing key must not
                # prevent the remaining fields from being saved.
                try:
                    doc.project = project
                except Exception as e: print(e)
                try:
                    doc.user = user
                except Exception as e: print(e)
                try:
                    doc.date = i['date']
                except Exception as e: print(e)
                try:
                    doc.uniqu_id= i['uniqu_id']
                except Exception as e: print(e)
                try:
                    doc.title = i['title']
                except Exception as e: print(e)
                try:
                    doc.source = i['source']
                except Exception as e: print(e)
                try:
                    doc.authors = i['authors']
                except Exception as e: print(e)
                try:
                    doc.abstract = i['abstract']
                except Exception as e: print(e)
                try:
                    doc.save()
                except Exception as e: print(e)
                doc.corpus.add(corpus)
        self.data = []
def demo():
    """Parse the ISI export given on the command line and store its notices.

    BUG FIX: the original called data.add(parameters=param, file=...) with
    an undefined name `param` and keyword arguments add() does not accept
    (guaranteed NameError/TypeError).
    """
    import sys
    data = Isi()
    # parse() expects a binary file object (it decodes each line itself).
    with open(sys.argv[1], 'rb') as source:
        data.parse(source)
    data.add()
if __name__ == "__main__" :
    try:
        demo()
    except Exception as error :
        # BUG FIX: sys.exc_traceback is a Python 2 relic removed in
        # Python 3; read the active traceback via sys.exc_info() instead.
        print(sys.exc_info()[2].tb_lineno, error)
from django.db import models
# Create your models here.
##############################################################################
# LEGEND:
# NAME (what you want[1]) FIELD (see your data) SEPARATORS (see your data)
#
# [1]
# Be careful to these names variables which do not have to change:
# BEGIN, ID-unique, END
##############################################################################
BEGIN PT ""
authors AU \n
AF AF "\n"
title TI ""
source SO "\n"
language LA ""
DT DT ""
keywords DE ;
ID ID ;
abstract AB
ISIC1 C1 \n
reprint_author RP ,
email EM \n
thanks FX
CR CR \n
number NR \n
TC TC ""
Z9 Z9 ""
PU PU ""
PI PI ""
PA PA ""
SN SN ""
journal_small J9 ""
JI JI ""
month PD ""
year PY ""
volume VL ""
IS IS ""
BP BP ""
EP EP ""
DOI DI ""
page PG ""
field WC ""
SC SC ""
GA GA ""
uniqu_id UT ""
END ER ""
tag_begin PMID-
tag_end $ligne$
longueur_tag 6
condition_debut_tag \S\S\s
AU - ISIAUTHOR $ligne$
TI - ISITITLE
PT - ISIDT
MH - ISIkeyword ;
FAU - ISIAF $ligne$
TA - ISIJOURNAL
JT - ISIJOURNALFull
RN - ISIID ;
AB - ISIABSTRACT
AD - ISIC1 $ligne$ ***, [***]
AD - ISIRP ,
AD - ISIFU ;
nope- ISICR $ligne$ ,
SO - ISITC
JT - ISISO
DA - ISIpubdate
VI - ISIVolume
PG - ISIPage
MH - ISISC ;
PMID- ISIUT
PMID- ISIDOI
##############################################################################
# LEGEND:
# NAME (what you want[1]) FIELD (see your data) SEPARATORS (see your data)
#
# [1]
# Be careful to these names variables which do not have to change:
# BEGIN, ID-unique, END
##############################################################################
BEGIN TY ""
authors AU \n
AF AF "\n"
title TI ""
source SO "\n"
language LA ""
DT DT ""
keywords KW ;
ID ID ;
abstract AB
text ST ,
ISIC1 C1 \n
reprint_author RP ,
email EM \n
thanks FX
CR CR \n
number NR \n
TC TC ""
Z9 Z9 ""
PU PU ""
PI PI ""
PA PA ""
SN SN ""
journal_small J9 ""
JI JI ""
month PD ""
year PY ""
volume VL ""
IS IS ""
BP BP ""
EP EP ""
DOI DI ""
page PG ""
field WC ""
SC SC ""
GA GA ""
uniqu_id DO ""
END ER ""
#!/usr/bin/env python
# *coding:Utf8*
"""
Pubmed Database parser
__author__ : http://alexandre.delanoe.org
__licence__ : GPL version 3.0+
__DATE__ : 2014
__VERSION__ : 0.1
"""
import datetime
import sys, string, codecs
from lxml import etree
from documents.models import Document
class Pubmed() :
    """
    Pubmed / Medline XML corpus parser.
    """
    def __init__(self) :
        """Prepare an empty corpus.

        BUG FIX: the original called Corpus.__init__(self), but no Corpus
        class is imported in this module, so every instantiation raised
        NameError; parse() also appended to self.texts without ever
        creating it.
        """
        self.data = []
        # Parsed notices: one dict (date, journal, title) per article.
        self.texts = []
        self.bdd = "Medline"

    def parse(self, file, bdd="PUBMED") :
        """
        Parse a Pubmed/Medline XML export and append one dict per article
        to self.texts.

        `file` may be a path or an already-open file-like object (e.g. a
        member returned by ZipFile.open, as the importer passes).
        The bdd argument is the BDD-SOURCE field (kept for compatibility).
        """
        parser = etree.XMLParser(resolve_entities=False, recover=True)
        if hasattr(file, 'read'):
            # Already open: parse it directly (the caller owns the handle).
            xml = etree.parse(file, parser=parser)
        else:
            # BUG FIX: the original opened the path and never closed it.
            with open(file, 'r') as source:
                xml = etree.parse(source, parser=parser)
        xml_docs = xml.findall('PubmedArticle/MedlineCitation')
        for xml_doc in xml_docs:
            year = int(xml_doc.find('DateCreated/Year').text)
            month = int(xml_doc.find('DateCreated/Month').text)
            day = int(xml_doc.find('DateCreated/Day').text)
            # BUG FIX: the original assigned to self.Article.<attr>, an
            # attribute that never existed; collect a dict per article.
            article = {
                'date': datetime.date(year, month, day),
                'journal': xml_doc.find('Article/Journal/Title').text,
                'title': xml_doc.find('Article/ArticleTitle').text,
            }
            self.texts.append(article)

    def add(self, file):
        """Parse *file*; kept for interface compatibility with callers."""
        self.parse(file)
def demo(file):
    """Parse *file* with the Pubmed parser and print the first notice."""
    corpus = Pubmed()
    corpus.parse(file)
    print(corpus.texts[0])
if __name__ == "__main__" :
    try:
        # BUG FIX: demo(file) requires the path of the export to parse;
        # it was called without arguments, which always raised TypeError.
        demo(sys.argv[1])
    except Exception as error :
        print(error)
#
#<PubmedArticle>
# <MedlineCitation Status="Publisher" Owner="NLM">
# <PMID Version="1">24363549</PMID>
# <DateCreated>
# <Year>2013</Year>
# <Month>12</Month>
# <Day>23</Day>
# </DateCreated>
# <Article PubModel="Print-Electronic">
# <Journal>
# <ISSN IssnType="Print">1080-7039</ISSN>
# <JournalIssue CitedMedium="Print">
# <Volume>20</Volume>
# <Issue>2</Issue>
# <PubDate>
# <Year>2014</Year>
# <Month>Feb</Month>
# </PubDate>
# </JournalIssue>
# <Title>Human and ecological risk assessment : HERA</Title>
# <ISOAbbreviation>Hum Ecol Risk Assess</ISOAbbreviation>
# </Journal>
# <ArticleTitle>A Causal Analysis of Observed Declines in Managed Honey Bees (Apis mellifera).</ArticleTitle>
# <Pagination>
# <MedlinePgn>566-591</MedlinePgn>
# </Pagination>
# <Abstract>
# <AbstractText NlmCategory="UNLABELLED">The European honey bee (Apis mellifera) is a highly valuable, semi-free-ranging managed agricultural species. While the number of managed hives has been increasing, declines in overwinter survival, and the onset of colony collapse disorder in 2006, precipitated a large amount of research on bees' health in an effort to isolate the causative factors. A workshop was convened during which bee experts were introduced to a formal causal analysis approach to compare 39 candidate causes against specified criteria to evaluate their relationship to the reduced overwinter survivability observed since 2006 of commercial bees used in the California almond industry. Candidate causes were categorized as probable, possible, or unlikely; several candidate causes were categorized as indeterminate due to lack of information. Due to time limitations, a full causal analysis was not completed at the workshop. In this article, examples are provided to illustrate the process and provide preliminary findings, using three candidate causes. Varroa mites plus viruses were judged to be a &quot;probable cause&quot; of the reduced survival, while nutrient deficiency was judged to be a &quot;possible cause.&quot; Neonicotinoid pesticides were judged to be &quot;unlikely&quot; as the sole cause of this reduced survival, although they could possibly be a contributing factor.</AbstractText>
# </Abstract>
# <AuthorList>
# <Author>
# <LastName>Staveley</LastName>
# <ForeName>Jane P</ForeName>
# <Initials>JP</Initials>
# <Affiliation>Exponent, Alexandria, VA, USA.</Affiliation>
# </Author>
# <Author>
# <LastName>Law</LastName>
# <ForeName>Sheryl A</ForeName>
# <Initials>SA</Initials>
# <Affiliation>Exponent, Alexandria, VA, USA.</Affiliation>
# </Author>
# <Author>
# <LastName>Fairbrother</LastName>
# <ForeName>Anne</ForeName>
# <Initials>A</Initials>
# <Affiliation>Exponent, Bellevue, WA, USA.</Affiliation>
# </Author>
# <Author>
# <LastName>Menzie</LastName>
# <ForeName>Charles A</ForeName>
# <Initials>CA</Initials>
# <Affiliation>Exponent, Alexandria, VA, USA.</Affiliation>
# </Author>
# </AuthorList>
# <Language>ENG</Language>
# <PublicationTypeList>
# <PublicationType>JOURNAL ARTICLE</PublicationType>
# </PublicationTypeList>
# <ArticleDate DateType="Electronic">
# <Year>2013</Year>
# <Month>11</Month>
# <Day>25</Day>
# </ArticleDate>
# </Article>
# <MedlineJournalInfo>
# <MedlineTA>Hum Ecol Risk Assess</MedlineTA>
# <NlmUniqueID>9513572</NlmUniqueID>
# <ISSNLinking>1080-7039</ISSNLinking>
# </MedlineJournalInfo>
# <KeywordList Owner="NOTNLM">
# <Keyword MajorTopicYN="N">Varroa</Keyword>
# <Keyword MajorTopicYN="N">causal analysis</Keyword>
# <Keyword MajorTopicYN="N">honey bees</Keyword>
# <Keyword MajorTopicYN="N">neonicotinoids</Keyword>
# </KeywordList>
# </MedlineCitation>
# <PubmedData>
# <History>
# <PubMedPubDate PubStatus="received">
# <Year>2013</Year>
# <Month>7</Month>
# <Day>8</Day>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="accepted">
# <Year>2013</Year>
# <Month>7</Month>
# <Day>23</Day>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="epublish">
# <Year>2013</Year>
# <Month>11</Month>
# <Day>25</Day>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="entrez">
# <Year>2013</Year>
# <Month>12</Month>
# <Day>24</Day>
# <Hour>6</Hour>
# <Minute>0</Minute>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="pubmed">
# <Year>2013</Year>
# <Month>12</Month>
# <Day>24</Day>
# <Hour>6</Hour>
# <Minute>0</Minute>
# </PubMedPubDate>
# <PubMedPubDate PubStatus="medline">
# <Year>2013</Year>
# <Month>12</Month>
# <Day>24</Day>
# <Hour>6</Hour>
# <Minute>0</Minute>
# </PubMedPubDate>
# </History>
# <PublicationStatus>ppublish</PublicationStatus>
# <ArticleIdList>
# <ArticleId IdType="doi">10.1080/10807039.2013.831263</ArticleId>
# <ArticleId IdType="pubmed">24363549</ArticleId>
# <ArticleId IdType="pmc">PMC3869053</ArticleId>
# </ArticleIdList>
# <?pmcsd?>
# </PubmedData>
#</PubmedArticle>
#
from django.test import TestCase
# Create your tests here.
from django.shortcuts import render
# Create your views here.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment