Commit 0720146f authored by Mathieu Rodic's avatar Mathieu Rodic

[FEATURE] ISI parser - Almost working

parent 00f1333b
__pycache__/
parsing/Taggers/treetagger/
{
"metadata": {
"name": "",
"signature": "sha256:cabaff3edb8995fecf78ead33fd8af0b9ada1fe75811cb60200317c70ea3079e"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pycountry\n",
"from node.models import Language"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pycountry\n",
"\n",
"for language in pycountry.languages:\n",
" try:\n",
" implemented = 1 if language.alpha2 in ['en', 'fr'] else 0\n",
" Language(iso2=language.alpha2, iso3=language.terminology, fullname=language.name, implemented=implemented).save()\n",
" except:\n",
" pass\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for language in Language.objects.all():\n",
" print(language)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Afar\n",
"Abkhazian\n",
"Afrikaans\n",
"Akan\n",
"Albanian\n",
"Amharic\n",
"Arabic\n",
"Aragonese\n",
"Armenian\n",
"Assamese\n",
"Avaric\n",
"Avestan\n",
"Aymara\n",
"Azerbaijani\n",
"Bashkir\n",
"Bambara\n",
"Basque\n",
"Belarusian\n",
"Bengali\n",
"Bihari languages\n",
"Bislama\n",
"Bosnian\n",
"Breton\n",
"Bulgarian\n",
"Burmese\n",
"Catalan; Valencian\n",
"Chamorro\n",
"Chechen\n",
"Chinese\n",
"Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic\n",
"Chuvash\n",
"Cornish\n",
"Corsican\n",
"Cree\n",
"Czech\n",
"Danish\n",
"Divehi; Dhivehi; Maldivian\n",
"Dutch; Flemish\n",
"Dzongkha\n",
"English\n",
"Esperanto\n",
"Estonian\n",
"Ewe\n",
"Faroese\n",
"Fijian\n",
"Finnish\n",
"French\n",
"Western Frisian\n",
"Fulah\n",
"Georgian\n",
"German\n",
"Gaelic; Scottish Gaelic\n",
"Irish\n",
"Galician\n",
"Manx\n",
"Greek, Modern (1453-)\n",
"Guarani\n",
"Gujarati\n",
"Haitian; Haitian Creole\n",
"Hausa\n",
"Hebrew\n",
"Herero\n",
"Hindi\n",
"Hiri Motu\n",
"Croatian\n",
"Hungarian\n",
"Igbo\n",
"Icelandic\n",
"Ido\n",
"Sichuan Yi; Nuosu\n",
"Inuktitut\n",
"Interlingue; Occidental\n",
"Interlingua (International Auxiliary Language Association)\n",
"Indonesian\n",
"Inupiaq\n",
"Italian\n",
"Javanese\n",
"Japanese\n",
"Kalaallisut; Greenlandic\n",
"Kannada\n",
"Kashmiri\n",
"Kanuri\n",
"Kazakh\n",
"Central Khmer\n",
"Kikuyu; Gikuyu\n",
"Kinyarwanda\n",
"Kirghiz; Kyrgyz\n",
"Komi\n",
"Kongo\n",
"Korean\n",
"Kuanyama; Kwanyama\n",
"Kurdish\n",
"Lao\n",
"Latin\n",
"Latvian\n",
"Limburgan; Limburger; Limburgish\n",
"Lingala\n",
"Lithuanian\n",
"Luxembourgish; Letzeburgesch\n",
"Luba-Katanga\n",
"Ganda\n",
"Macedonian\n",
"Marshallese\n",
"Malayalam\n",
"Maori\n",
"Marathi\n",
"Malay\n",
"Malagasy\n",
"Maltese\n",
"Moldavian; Moldovan\n",
"Mongolian\n",
"Nauru\n",
"Navajo; Navaho\n",
"Ndebele, South; South Ndebele\n",
"Ndebele, North; North Ndebele\n",
"Ndonga\n",
"Nepali\n",
"Norwegian Nynorsk; Nynorsk, Norwegian\n",
"Bokm\u00e5l, Norwegian; Norwegian Bokm\u00e5l\n",
"Norwegian\n",
"Chichewa; Chewa; Nyanja\n",
"Occitan (post 1500)\n",
"Ojibwa\n",
"Oriya\n",
"Oromo\n",
"Ossetian; Ossetic\n",
"Panjabi; Punjabi\n",
"Persian\n",
"Pali\n",
"Polish\n",
"Portuguese\n",
"Pushto; Pashto\n",
"Quechua\n",
"Romansh\n",
"Romanian\n",
"Rundi\n",
"Russian\n",
"Sango\n",
"Sanskrit\n",
"Sinhala; Sinhalese\n",
"Slovak\n",
"Slovenian\n",
"Northern Sami\n",
"Samoan\n",
"Shona\n",
"Sindhi\n",
"Somali\n",
"Sotho, Southern\n",
"Spanish; Castilian\n",
"Sardinian\n",
"Serbian\n",
"Swati\n",
"Sundanese\n",
"Swahili\n",
"Swedish\n",
"Tahitian\n",
"Tamil\n",
"Tatar\n",
"Telugu\n",
"Tajik\n",
"Tagalog\n",
"Thai\n",
"Tibetan\n",
"Tigrinya\n",
"Tonga (Tonga Islands)\n",
"Tswana\n",
"Tsonga\n",
"Turkmen\n",
"Turkish\n",
"Twi\n",
"Uighur; Uyghur\n",
"Ukrainian\n",
"Urdu\n",
"Uzbek\n",
"Venda\n",
"Vietnamese\n",
"Volap\u00fck\n",
"Welsh\n",
"Walloon\n",
"Wolof\n",
"Xhosa\n",
"Yiddish\n",
"Yoruba\n",
"Zhuang; Chuang\n",
"Zulu\n"
]
}
],
"prompt_number": 11
}
],
"metadata": {}
}
]
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:634b0bc0fc552e28c568a3fa59f7567a562a69376a589c6f7c3960bcaf7a94e8"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\"RE abcdefgh\\n\"[3:-1]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"'abcdefgh'"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"parser = IsiFileParser(filepath='/home/mat/projects/gargantext/data_samples/isi.txt')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"parser.parse()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"{'concatenate': ', ', 'key': 'authors', 'type': 'metadata'}\n"
]
},
{
"ename": "TypeError",
"evalue": "sequence item 0: expected str instance, bytes found",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-4-785d3def061e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/home/mat/projects/gargantext/gargantext/parsing/FileParsers/IsiFileParser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, parentNode, tag)\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mparameter\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"type\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"metadata\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparameter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 25\u001b[1;33m \u001b[0mmetadata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mparameter\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"key\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparameter\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"concatenate\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlast_values\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 26\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mparameter\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"type\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"delimiter\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: sequence item 0: expected str instance, bytes found"
]
}
],
"prompt_number": 4
}
],
"metadata": {}
}
]
}
\ No newline at end of file
...@@ -5,28 +5,29 @@ from parsing.FileParsers.FileParser import FileParser ...@@ -5,28 +5,29 @@ from parsing.FileParsers.FileParser import FileParser
class IsiFileParser(FileParser):
    """Parser for ISI / Web of Science plain-text export files.

    Each record is a sequence of lines starting with a 2-character field tag
    (e.g. b"TI", b"AU"); continuation lines start with blanks; b"ER" ends a
    record.  The file is read in binary mode, hence the bytes keys below.
    """

    # Field tags handled by parse(). "concatenate" is the bytes separator
    # used to join a multi-line field's values.
    _parameters = {
        b"ER": {"type": "delimiter"},
        b"TI": {"type": "metadata", "key": "title", "concatenate": b" "},
        b"AU": {"type": "metadata", "key": "authors", "concatenate": b", "},
        b"AB": {"type": "metadata", "key": "abstract", "concatenate": b" "},
    }

    def parse(self, parentNode=None, tag=True):
        """Parse the ISI file attached to this parser.

        parentNode, tag: accepted for interface compatibility with the other
        FileParser subclasses; not used yet (work in progress).
        """
        metadata = {}
        last_key = None
        last_values = []
        for line in self._file:
            if len(line) > 2:
                parameter_key = line[:2]
                # A continuation line starts with two blanks: it belongs to
                # the current field.  Otherwise a new tag starts, so flush the
                # values accumulated for the previous tag first.
                # FIX: compare against two spaces — line[:2] is 2 bytes long,
                # so comparing it to a single-space literal never matched.
                if parameter_key != b'  ' and parameter_key != last_key:
                    if last_key in self._parameters:
                        parameter = self._parameters[last_key]
                        if parameter["type"] == "metadata":
                            # Join the field's lines with its bytes separator
                            # (values are bytes, so the separator must be too).
                            metadata[parameter["key"]] = parameter["concatenate"].join(last_values)
                        elif parameter["type"] == "delimiter":
                            # End of record.  NOTE(review): the break stops
                            # after the first record — presumably temporary
                            # debugging behavior ("almost working").
                            print(metadata)
                            metadata = {}
                            break
                    last_key = parameter_key
                    last_values = []
                # Strip the 3-character tag prefix and the trailing newline.
                last_values.append(line[3:-1])
        # FIX: the file handle is self._file (iterated above), not self.file.
        self._file.close()
{
"metadata": {
"name": "",
"signature": "sha256:71dcc854ee670084dd2d3795a96e0faa7d3feb1f1958d41b08c32fe1a0d70be9"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Ok!"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"plant-pathogenic rna virus\n",
"significant source\n",
"result\n",
"host populations\n",
"in\n",
"arthropod hosts\n",
"unique example\n",
"spread\n",
"tobacco ringspot\n",
"colony survival\n",
"apis mellifera\n",
"other bee viruses"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"negative impact\n",
"threat\n",
"honeybees\n",
"varroa mites\n",
"intracellular life cycle\n",
"virus\n",
"conjunction\n",
"honeybee hosts\n",
"bee hemolymph\n",
"distinct lineage\n",
"transkingdom host alteration"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"monophyletic clade\n",
"prevalence\n",
"winter\n",
"pathogen host shifts\n",
"furthermore\n",
"species-level genetic variation\n",
"trsvs\n",
"diseases\n",
"gradual decline\n",
"domesticates\n",
"systemic invasion"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"high mutation rates\n",
"pathogenesis\n",
"entire body\n",
"humans\n",
"plant hosts\n",
"infections\n",
"virions\n",
"plant\n",
"varroa"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"infectious diseases\n",
"winter colony collapse\n",
"infected colonies\n",
"rna viruses\n",
"gastric cecum"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"trsv-infected individuals\n",
"instances\n",
"host ranges\n",
"health\n",
"viruses\n",
"study\n",
"bees\n",
"ectoparasitic varroa\n",
"present study\n",
"tree topology"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"animal kingdoms\n",
"phylogenetic analysis\n",
"colonies\n",
"feed\n",
"common ancestor\n",
"trsv\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
}
],
"metadata": {}
}
]
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment