Commit 740c4055 authored by Administrator's avatar Administrator

Merge branch 'mat'

Integration of Mathieu's contributions
parents 555a6ec8 86bbf12a
__pycache__/
parsing/Taggers/treetagger/
{
"metadata": {
"name": "",
"signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:cabaff3edb8995fecf78ead33fd8af0b9ada1fe75811cb60200317c70ea3079e"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pycountry\n",
"from node.models import Language"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pycountry\n",
"\n",
"for language in pycountry.languages:\n",
" try:\n",
" implemented = 1 if language.alpha2 in ['en', 'fr'] else 0\n",
" Language(iso2=language.alpha2, iso3=language.terminology, fullname=language.name, implemented=implemented).save()\n",
" except:\n",
" pass\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for language in Language.objects.all():\n",
" print(language)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Afar\n",
"Abkhazian\n",
"Afrikaans\n",
"Akan\n",
"Albanian\n",
"Amharic\n",
"Arabic\n",
"Aragonese\n",
"Armenian\n",
"Assamese\n",
"Avaric\n",
"Avestan\n",
"Aymara\n",
"Azerbaijani\n",
"Bashkir\n",
"Bambara\n",
"Basque\n",
"Belarusian\n",
"Bengali\n",
"Bihari languages\n",
"Bislama\n",
"Bosnian\n",
"Breton\n",
"Bulgarian\n",
"Burmese\n",
"Catalan; Valencian\n",
"Chamorro\n",
"Chechen\n",
"Chinese\n",
"Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic\n",
"Chuvash\n",
"Cornish\n",
"Corsican\n",
"Cree\n",
"Czech\n",
"Danish\n",
"Divehi; Dhivehi; Maldivian\n",
"Dutch; Flemish\n",
"Dzongkha\n",
"English\n",
"Esperanto\n",
"Estonian\n",
"Ewe\n",
"Faroese\n",
"Fijian\n",
"Finnish\n",
"French\n",
"Western Frisian\n",
"Fulah\n",
"Georgian\n",
"German\n",
"Gaelic; Scottish Gaelic\n",
"Irish\n",
"Galician\n",
"Manx\n",
"Greek, Modern (1453-)\n",
"Guarani\n",
"Gujarati\n",
"Haitian; Haitian Creole\n",
"Hausa\n",
"Hebrew\n",
"Herero\n",
"Hindi\n",
"Hiri Motu\n",
"Croatian\n",
"Hungarian\n",
"Igbo\n",
"Icelandic\n",
"Ido\n",
"Sichuan Yi; Nuosu\n",
"Inuktitut\n",
"Interlingue; Occidental\n",
"Interlingua (International Auxiliary Language Association)\n",
"Indonesian\n",
"Inupiaq\n",
"Italian\n",
"Javanese\n",
"Japanese\n",
"Kalaallisut; Greenlandic\n",
"Kannada\n",
"Kashmiri\n",
"Kanuri\n",
"Kazakh\n",
"Central Khmer\n",
"Kikuyu; Gikuyu\n",
"Kinyarwanda\n",
"Kirghiz; Kyrgyz\n",
"Komi\n",
"Kongo\n",
"Korean\n",
"Kuanyama; Kwanyama\n",
"Kurdish\n",
"Lao\n",
"Latin\n",
"Latvian\n",
"Limburgan; Limburger; Limburgish\n",
"Lingala\n",
"Lithuanian\n",
"Luxembourgish; Letzeburgesch\n",
"Luba-Katanga\n",
"Ganda\n",
"Macedonian\n",
"Marshallese\n",
"Malayalam\n",
"Maori\n",
"Marathi\n",
"Malay\n",
"Malagasy\n",
"Maltese\n",
"Moldavian; Moldovan\n",
"Mongolian\n",
"Nauru\n",
"Navajo; Navaho\n",
"Ndebele, South; South Ndebele\n",
"Ndebele, North; North Ndebele\n",
"Ndonga\n",
"Nepali\n",
"Norwegian Nynorsk; Nynorsk, Norwegian\n",
"Bokm\u00e5l, Norwegian; Norwegian Bokm\u00e5l\n",
"Norwegian\n",
"Chichewa; Chewa; Nyanja\n",
"Occitan (post 1500)\n",
"Ojibwa\n",
"Oriya\n",
"Oromo\n",
"Ossetian; Ossetic\n",
"Panjabi; Punjabi\n",
"Persian\n",
"Pali\n",
"Polish\n",
"Portuguese\n",
"Pushto; Pashto\n",
"Quechua\n",
"Romansh\n",
"Romanian\n",
"Rundi\n",
"Russian\n",
"Sango\n",
"Sanskrit\n",
"Sinhala; Sinhalese\n",
"Slovak\n",
"Slovenian\n",
"Northern Sami\n",
"Samoan\n",
"Shona\n",
"Sindhi\n",
"Somali\n",
"Sotho, Southern\n",
"Spanish; Castilian\n",
"Sardinian\n",
"Serbian\n",
"Swati\n",
"Sundanese\n",
"Swahili\n",
"Swedish\n",
"Tahitian\n",
"Tamil\n",
"Tatar\n",
"Telugu\n",
"Tajik\n",
"Tagalog\n",
"Thai\n",
"Tibetan\n",
"Tigrinya\n",
"Tonga (Tonga Islands)\n",
"Tswana\n",
"Tsonga\n",
"Turkmen\n",
"Turkish\n",
"Twi\n",
"Uighur; Uyghur\n",
"Ukrainian\n",
"Urdu\n",
"Uzbek\n",
"Venda\n",
"Vietnamese\n",
"Volap\u00fck\n",
"Welsh\n",
"Walloon\n",
"Wolof\n",
"Xhosa\n",
"Yiddish\n",
"Yoruba\n",
"Zhuang; Chuang\n",
"Zulu\n"
]
}
],
"prompt_number": 11
}
],
"metadata": {}
}
]
}
\ No newline at end of file
......@@ -55,7 +55,8 @@ python manage.py syncdb
Start the Python Notebook server
--------------------------------
1) In Pyvenv: python manage.py shell_plus --notebook
1) In Pyvenv:
python manage.py shell_plus --notebook
2) Work from your browser!
......@@ -63,4 +64,5 @@ Start the Python Notebook server
Start the Django server
-----------------------
In Pyvenv:
python manage.py runserver
\ No newline at end of file
#import FileParser
#
#class EuropressFileParser(FileParser, contents):
#
# def parse():
# pass
#
from parsing.FileParsers.FileParser import FileParser
class EuropressFileParser(FileParser):
    def parse(self):
        pass
import collections
from node.models import Node, NodeType, Language, Ngram, Node_Ngram
from parsing.NgramsExtractors import *
import dateutil.parser
class NgramCache:
"""
This allows the fast retrieval of ngram ids
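The body of NgramCache is collapsed in this diff; as a rough sketch (an assumption, not the code from this commit), such a cache could be a thin get-or-create wrapper around the Ngram model:

# Hypothetical sketch only -- the field names (terms, language) are assumptions,
# not taken from this commit. The idea: hit the database once per distinct
# ngram, then serve every later lookup from an in-memory dict.
class NgramCacheSketch:

    def __init__(self, language):
        self._cache = {}
        self._language = language

    def __getitem__(self, terms):
        terms = terms.strip().lower()
        if terms not in self._cache:
            ngram, _created = Ngram.objects.get_or_create(
                terms=terms,
                language=self._language,
            )
            self._cache[terms] = ngram
        return self._cache[terms]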
......@@ -48,6 +51,7 @@ class FileParser:
        self._extractors = dict()
        self._document_nodetype = NodeType.objects.get(name='Document')
        languages = Language.objects.all()
        self._languages_fullname = {language.fullname.lower(): language for language in languages}
        self._languages_iso2 = {language.iso2.lower(): language for language in languages}
        self._languages_iso3 = {language.iso3.lower(): language for language in languages}
        #self.parse()
......@@ -85,6 +89,7 @@ class FileParser:
"""Add a document to the database.
"""
def create_document(self, parentNode, title, contents, language, metadata, guid=None):
metadata = self.format_metadata(metadata)
# create or retrieve a resource for that document, based on its user id
# if guid is None:
# resource = Resource(guid=guid)
......@@ -137,3 +142,51 @@ class FileParser:
    def parse(self):
        return list()
    def format_metadata_dates(self, metadata):
        """Format the dates found in the metadata.
        Example: {"publication_date": "2014-10-23 09:57:42"}
              -> {"publication_date": "2014-10-23 09:57:42",
                  "publication_year": "2014", "publication_month": "10",
                  "publication_day": "23", "publication_hour": "09", ...}
        """
        # First, check the split dates...
        prefixes = [key[:-5] for key in metadata.keys() if key[-5:] == "_year"]
        for prefix in prefixes:
            date_string = metadata[prefix + "_year"]
            key = prefix + "_month"
            if key in metadata:
                date_string += " " + metadata[key]
            key = prefix + "_day"
            if key in metadata:
                date_string += " " + metadata[key]
            key = prefix + "_hour"
            if key in metadata:
                date_string += " " + metadata[key]
            key = prefix + "_minute"
            if key in metadata:
                date_string += ":" + metadata[key]
            key = prefix + "_second"
            if key in metadata:
                date_string += ":" + metadata[key]
            try:
                metadata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
            except:
                pass
        # ...then split every "_date" field back into its component parts
        prefixes = [key[:-5] for key in metadata.keys() if key[-5:] == "_date"]
        for prefix in prefixes:
            date = dateutil.parser.parse(metadata[prefix + "_date"])
            metadata[prefix + "_year"] = date.strftime("%Y")
            metadata[prefix + "_month"] = date.strftime("%m")
            metadata[prefix + "_day"] = date.strftime("%d")
            metadata[prefix + "_hour"] = date.strftime("%H")
            metadata[prefix + "_minute"] = date.strftime("%M")
            metadata[prefix + "_second"] = date.strftime("%S")
        # finally, return the result!
        return metadata
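As a quick illustration (hypothetical values, reusing the fileparser instance from the notebook above), the round trip assembles split *_year/*_month/*_day fields into a *_date string, then breaks every *_date field back into its components:

# Illustration only -- the input values are made up.
metadata = {
    "publication_year": "2014",
    "publication_month": "10",
    "publication_day": "23",
}
metadata = fileparser.format_metadata_dates(metadata)
# Expected result (assuming dateutil parses "2014 10 23"):
# {"publication_year": "2014", "publication_month": "10", "publication_day": "23",
#  "publication_date": "2014-10-23 00:00:00",
#  "publication_hour": "00", "publication_minute": "00", "publication_second": "00"}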
    def format_metadata(self, metadata):
        """Format the metadata."""
        metadata = self.format_metadata_dates(metadata)
        return metadata
\ No newline at end of file
from django.db import transaction
from FileParser import FileParser
from parsing.FileParsers.RisFileParser import RisFileParser
class IsiFileParser(FileParser):
class IsiFileParser(RisFileParser):
def parse(self, parentNode):
# read the file, line by line
for line in self.__file:
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml = etree.parse(self._file, parser=xml_parser)
# parse all the articles, one by one
# all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle')
with transaction.atomic():
for xml_article in xml_articles:
# extract data from the document
date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
metadata = {
# other metadata should also be included:
# authors, submission date, etc.
"date_pub": datetime.date(year, month, day),
"journal": xml_article.find('MedlineCitation/Article/Journal/Title').text
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text
"language_iso3": xml_article.find('MedlineCitation/Article/Language').text
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text
}
contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
# create the document in the database
yield self.create_document(
parentNode = parentNode,
title = metadata["title"],
contents = contents,
language = self._languages_iso3[metadata["language_iso3"].lower()],
metadata = metadata,
guid = metadata["doi"],
)
_parameters = {
b"ER": {"type": "delimiter"},
b"TI": {"type": "metadata", "key": "title", "separator": " "},
b"AU": {"type": "metadata", "key": "authors", "separator": ", "},
b"DI": {"type": "metadata", "key": "doi"},
b"PY": {"type": "metadata", "key": "publication_year"},
b"PD": {"type": "metadata", "key": "publication_month"},
b"LA": {"type": "metadata", "key": "language"},
b"AB": {"type": "metadata", "key": "abstract", "separator": " "},
b"WC": {"type": "metadata", "key": "fields"},
}
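For context, here is a rough sketch (an assumption, not RisFileParser's actual body, which is not shown in this diff) of how a line-oriented reader could consume a tag table like _parameters:

# Hypothetical sketch only: accumulate tagged lines into a metadata dict and
# emit one dict per record whenever a "delimiter" tag (here b"ER") is reached.
# The value offset (line[3:]) is illustrative, not the parser's real layout.
def read_records(lines, parameters):
    # 'lines' is an iterable of bytes (e.g. an open binary file),
    # 'parameters' is a tag table shaped like _parameters above.
    metadata = {}
    for line in lines:
        tag = line[:2]
        if tag not in parameters:
            continue
        parameter = parameters[tag]
        if parameter["type"] == "delimiter":
            if metadata:
                yield metadata
            metadata = {}
        elif parameter["type"] == "metadata":
            key = parameter["key"]
            value = line[3:].strip().decode("utf-8", errors="replace")
            if key in metadata:
                metadata[key] = metadata[key] + parameter.get("separator", "") + value
            else:
                metadata[key] = value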
......@@ -7,7 +7,7 @@ import datetime
class PubmedFileParser(FileParser):
def parse(self, parentNode, tag=True):
def parse(self, parentNode=None, tag=True):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
documents = []
......@@ -16,7 +16,6 @@ class PubmedFileParser(FileParser):
with zipfile.ZipFile(self._file) as zipFile:
for filename in zipFile.namelist():
file = zipFile.open(filename, "r")
# print(file.read())
xml = etree.parse(file, parser=xml_parser)
# parse all the articles, one by one
......@@ -24,19 +23,17 @@ class PubmedFileParser(FileParser):
xml_articles = xml.findall('PubmedArticle')
for xml_article in xml_articles:
# extract data from the document
date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
metadata = {
"date_pub": '%s-%s-%s' % (date_year, date_month, date_day),
}
metadata = {}
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText'
}
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
}
for key, path in metadata_path.items():
try:
node = xml_article.find(path)
......
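The rest of this loop is truncated in the diff; a hedged guess at the continuation (illustrative only, not the verbatim code) would copy each found element's text into the metadata dict:

# Hypothetical sketch of how the truncated loop probably ends: store the text
# of each element found by its XPath under its metadata key, skipping misses.
def extract_metadata(xml_article, metadata_path):
    metadata = {}
    for key, path in metadata_path.items():
        try:
            node = xml_article.find(path)
            if node is not None and node.text is not None:
                metadata[key] = node.text
        except Exception:
            pass
    return metadata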
#from parsing.FileParsers import EuropressFileParser
from parsing.FileParsers import PubmedFileParser
from parsing.FileParsers.IsiFileParser import IsiFileParser
from parsing.FileParsers.PubmedFileParser import PubmedFileParser
from parsing.FileParsers.EuropressFileParser import EuropressFileParser