Commit 0720146f authored by Mathieu Rodic's avatar Mathieu Rodic

[FEATURE] ISI parser - Almost working

parent 00f1333b
__pycache__/
parsing/Taggers/treetagger/
{
"metadata": {
"name": "",
"signature": "sha256:cabaff3edb8995fecf78ead33fd8af0b9ada1fe75811cb60200317c70ea3079e"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pycountry\n",
"from node.models import Language"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pycountry\n",
"\n",
"for language in pycountry.languages:\n",
" try:\n",
" implemented = 1 if language.alpha2 in ['en', 'fr'] else 0\n",
" Language(iso2=language.alpha2, iso3=language.terminology, fullname=language.name, implemented=implemented).save()\n",
" except:\n",
" pass\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for language in Language.objects.all():\n",
" print(language)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Afar\n",
"Abkhazian\n",
"Afrikaans\n",
"Akan\n",
"Albanian\n",
"Amharic\n",
"Arabic\n",
"Aragonese\n",
"Armenian\n",
"Assamese\n",
"Avaric\n",
"Avestan\n",
"Aymara\n",
"Azerbaijani\n",
"Bashkir\n",
"Bambara\n",
"Basque\n",
"Belarusian\n",
"Bengali\n",
"Bihari languages\n",
"Bislama\n",
"Bosnian\n",
"Breton\n",
"Bulgarian\n",
"Burmese\n",
"Catalan; Valencian\n",
"Chamorro\n",
"Chechen\n",
"Chinese\n",
"Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic\n",
"Chuvash\n",
"Cornish\n",
"Corsican\n",
"Cree\n",
"Czech\n",
"Danish\n",
"Divehi; Dhivehi; Maldivian\n",
"Dutch; Flemish\n",
"Dzongkha\n",
"English\n",
"Esperanto\n",
"Estonian\n",
"Ewe\n",
"Faroese\n",
"Fijian\n",
"Finnish\n",
"French\n",
"Western Frisian\n",
"Fulah\n",
"Georgian\n",
"German\n",
"Gaelic; Scottish Gaelic\n",
"Irish\n",
"Galician\n",
"Manx\n",
"Greek, Modern (1453-)\n",
"Guarani\n",
"Gujarati\n",
"Haitian; Haitian Creole\n",
"Hausa\n",
"Hebrew\n",
"Herero\n",
"Hindi\n",
"Hiri Motu\n",
"Croatian\n",
"Hungarian\n",
"Igbo\n",
"Icelandic\n",
"Ido\n",
"Sichuan Yi; Nuosu\n",
"Inuktitut\n",
"Interlingue; Occidental\n",
"Interlingua (International Auxiliary Language Association)\n",
"Indonesian\n",
"Inupiaq\n",
"Italian\n",
"Javanese\n",
"Japanese\n",
"Kalaallisut; Greenlandic\n",
"Kannada\n",
"Kashmiri\n",
"Kanuri\n",
"Kazakh\n",
"Central Khmer\n",
"Kikuyu; Gikuyu\n",
"Kinyarwanda\n",
"Kirghiz; Kyrgyz\n",
"Komi\n",
"Kongo\n",
"Korean\n",
"Kuanyama; Kwanyama\n",
"Kurdish\n",
"Lao\n",
"Latin\n",
"Latvian\n",
"Limburgan; Limburger; Limburgish\n",
"Lingala\n",
"Lithuanian\n",
"Luxembourgish; Letzeburgesch\n",
"Luba-Katanga\n",
"Ganda\n",
"Macedonian\n",
"Marshallese\n",
"Malayalam\n",
"Maori\n",
"Marathi\n",
"Malay\n",
"Malagasy\n",
"Maltese\n",
"Moldavian; Moldovan\n",
"Mongolian\n",
"Nauru\n",
"Navajo; Navaho\n",
"Ndebele, South; South Ndebele\n",
"Ndebele, North; North Ndebele\n",
"Ndonga\n",
"Nepali\n",
"Norwegian Nynorsk; Nynorsk, Norwegian\n",
"Bokm\u00e5l, Norwegian; Norwegian Bokm\u00e5l\n",
"Norwegian\n",
"Chichewa; Chewa; Nyanja\n",
"Occitan (post 1500)\n",
"Ojibwa\n",
"Oriya\n",
"Oromo\n",
"Ossetian; Ossetic\n",
"Panjabi; Punjabi\n",
"Persian\n",
"Pali\n",
"Polish\n",
"Portuguese\n",
"Pushto; Pashto\n",
"Quechua\n",
"Romansh\n",
"Romanian\n",
"Rundi\n",
"Russian\n",
"Sango\n",
"Sanskrit\n",
"Sinhala; Sinhalese\n",
"Slovak\n",
"Slovenian\n",
"Northern Sami\n",
"Samoan\n",
"Shona\n",
"Sindhi\n",
"Somali\n",
"Sotho, Southern\n",
"Spanish; Castilian\n",
"Sardinian\n",
"Serbian\n",
"Swati\n",
"Sundanese\n",
"Swahili\n",
"Swedish\n",
"Tahitian\n",
"Tamil\n",
"Tatar\n",
"Telugu\n",
"Tajik\n",
"Tagalog\n",
"Thai\n",
"Tibetan\n",
"Tigrinya\n",
"Tonga (Tonga Islands)\n",
"Tswana\n",
"Tsonga\n",
"Turkmen\n",
"Turkish\n",
"Twi\n",
"Uighur; Uyghur\n",
"Ukrainian\n",
"Urdu\n",
"Uzbek\n",
"Venda\n",
"Vietnamese\n",
"Volap\u00fck\n",
"Welsh\n",
"Walloon\n",
"Wolof\n",
"Xhosa\n",
"Yiddish\n",
"Yoruba\n",
"Zhuang; Chuang\n",
"Zulu\n"
]
}
],
"prompt_number": 11
}
],
"metadata": {}
}
]
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:634b0bc0fc552e28c568a3fa59f7567a562a69376a589c6f7c3960bcaf7a94e8"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\"RE abcdefgh\\n\"[3:-1]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"'abcdefgh'"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"parser = IsiFileParser(filepath='/home/mat/projects/gargantext/data_samples/isi.txt')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"parser.parse()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"{'concatenate': ', ', 'key': 'authors', 'type': 'metadata'}\n"
]
},
{
"ename": "TypeError",
"evalue": "sequence item 0: expected str instance, bytes found",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-4-785d3def061e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/home/mat/projects/gargantext/gargantext/parsing/FileParsers/IsiFileParser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, parentNode, tag)\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mparameter\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"type\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"metadata\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparameter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 25\u001b[1;33m \u001b[0mmetadata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mparameter\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"key\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparameter\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"concatenate\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlast_values\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 26\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mparameter\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"type\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"delimiter\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: sequence item 0: expected str instance, bytes found"
]
}
],
"prompt_number": 4
}
],
"metadata": {}
}
]
}
\ No newline at end of file
...@@ -5,28 +5,29 @@ from parsing.FileParsers.FileParser import FileParser ...@@ -5,28 +5,29 @@ from parsing.FileParsers.FileParser import FileParser
class IsiFileParser(FileParser):
    """Parser for ISI / Web of Science plain-text export files.

    Each record is a sequence of lines starting with a 2-character field tag
    (e.g. b"TI", b"AU"); continuation lines start with blanks; b"ER" ends a
    record.  The file is read in binary mode, hence the bytes keys below.
    """

    # Field tags handled by parse(). "concatenate" is the bytes separator
    # used to join a multi-line field's values.
    _parameters = {
        b"ER": {"type": "delimiter"},
        b"TI": {"type": "metadata", "key": "title", "concatenate": b" "},
        b"AU": {"type": "metadata", "key": "authors", "concatenate": b", "},
        b"AB": {"type": "metadata", "key": "abstract", "concatenate": b" "},
    }

    def parse(self, parentNode=None, tag=True):
        """Parse the ISI file attached to this parser.

        parentNode, tag: accepted for interface compatibility with the other
        FileParser subclasses; not used yet (work in progress).
        """
        metadata = {}
        last_key = None
        last_values = []
        for line in self._file:
            if len(line) > 2:
                parameter_key = line[:2]
                # A continuation line starts with two blanks: it belongs to
                # the current field.  Otherwise a new tag starts, so flush the
                # values accumulated for the previous tag first.
                # FIX: compare against two spaces — line[:2] is 2 bytes long,
                # so comparing it to a single-space literal never matched.
                if parameter_key != b'  ' and parameter_key != last_key:
                    if last_key in self._parameters:
                        parameter = self._parameters[last_key]
                        if parameter["type"] == "metadata":
                            # Join the field's lines with its bytes separator
                            # (values are bytes, so the separator must be too).
                            metadata[parameter["key"]] = parameter["concatenate"].join(last_values)
                        elif parameter["type"] == "delimiter":
                            # End of record.  NOTE(review): the break stops
                            # after the first record — presumably temporary
                            # debugging behavior ("almost working").
                            print(metadata)
                            metadata = {}
                            break
                    last_key = parameter_key
                    last_values = []
                # Strip the 3-character tag prefix and the trailing newline.
                last_values.append(line[3:-1])
        # FIX: the file handle is self._file (iterated above), not self.file.
        self._file.close()
{
"metadata": {
"name": "",
"signature": "sha256:71dcc854ee670084dd2d3795a96e0faa7d3feb1f1958d41b08c32fe1a0d70be9"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Ok!"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"plant-pathogenic rna virus\n",
"significant source\n",
"result\n",
"host populations\n",
"in\n",
"arthropod hosts\n",
"unique example\n",
"spread\n",
"tobacco ringspot\n",
"colony survival\n",
"apis mellifera\n",
"other bee viruses"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"negative impact\n",
"threat\n",
"honeybees\n",
"varroa mites\n",
"intracellular life cycle\n",
"virus\n",
"conjunction\n",
"honeybee hosts\n",
"bee hemolymph\n",
"distinct lineage\n",
"transkingdom host alteration"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"monophyletic clade\n",
"prevalence\n",
"winter\n",
"pathogen host shifts\n",
"furthermore\n",
"species-level genetic variation\n",
"trsvs\n",
"diseases\n",
"gradual decline\n",
"domesticates\n",
"systemic invasion"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"high mutation rates\n",
"pathogenesis\n",
"entire body\n",
"humans\n",
"plant hosts\n",
"infections\n",
"virions\n",
"plant\n",
"varroa"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"infectious diseases\n",
"winter colony collapse\n",
"infected colonies\n",
"rna viruses\n",
"gastric cecum"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"trsv-infected individuals\n",
"instances\n",
"host ranges\n",
"health\n",
"viruses\n",
"study\n",
"bees\n",
"ectoparasitic varroa\n",
"present study\n",
"tree topology"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"animal kingdoms\n",
"phylogenetic analysis\n",
"colonies\n",
"feed\n",
"common ancestor\n",
"trsv\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
}
],
"metadata": {}
}
]
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment