gargantext: Commit f6122f1c
authored Oct 23, 2014 by Mathieu Rodic
[FEATURE] file parsers - separate parsing from extraction
Work in progress...
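The change, as reflected in the diffs below, splits the old single-pass parse() (which read files and extracted ngrams in one go) into two steps: parse()/_parse() now only creates document nodes with their metadata, while a new extract() method tags ngrams afterwards, field by field. A minimal sketch of the intended call sequence, assuming the names used in the notebooks and diffs below (corpus being a Node of type "Corpus"):

    # Hypothetical two-step usage of the new API; names follow the diffs below.
    fileparser = PubmedFileParser.PubmedFileParser(file='data_samples/pubmed.zip')
    fileparser.parse(corpus)                           # step 1: create Document nodes only
    fileparser.extract(corpus, ['title', 'abstract'])  # step 2: tag and count ngrams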
parent 86bbf12a
Showing 6 changed files with 605 additions and 97 deletions (+605, -97)
.ipynb_checkpoints/Test Pubmed parsing-checkpoint.ipynb   +127  -0
Test ISI parsing.ipynb                                    +34   -6
Test Pubmed parsing.ipynb                                 +265  -0
parsing/FileParsers/FileParser.py                         +102  -46
parsing/FileParsers/PubmedFileParser.py                   +38   -45
parsing/FileParsers/RisFileParser.py                      +39   -0
.ipynb_checkpoints/Test Pubmed parsing-checkpoint.ipynb (new file, 0 → 100644)
{
"metadata": {
"name": "",
"signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
Test ISI parsing.ipynb
{
"metadata": {
"name": "",
"signature": "sha256:
eac7c9b22e240bb0ef6d0aeec21261194d84a3f0ba53cd02af69f80d30ec5a17
"
"signature": "sha256:
70c2c8a4c8089e61195ee9da9232043152cf5e6c658a32115c0dcf990c2e98af
"
},
"nbformat": 3,
"nbformat_minor": 0,
...
...
@@ -122,17 +122,34 @@
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"d = dateutil.parser.parse(\"2014 OCT 11 1:2:3\")"
"import locale\n",
"locale.setlocale(locale.LC_ALL, \"fr_FR\")\n",
"d = dateutil.parser.parse(\"20 janvier 2004\")"
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [
{
"ename": "TypeError",
"evalue": "'NoneType' object is not iterable",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-2-0756678732db>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mlocale\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mlocale\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetlocale\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlocale\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mLC_ALL\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"fr_FR\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0md\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdateutil\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"20 janvier 2004\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/home/mat/projects/gargantext/myvenv/lib/python3.4/site-packages/dateutil/parser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(timestr, parserinfo, **kwargs)\u001b[0m\n\u001b[0;32m 746\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparserinfo\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimestr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 747\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 748\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mDEFAULTPARSER\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimestr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 749\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 750\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/mat/projects/gargantext/myvenv/lib/python3.4/site-packages/dateutil/parser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, timestr, default, ignoretz, tzinfos, **kwargs)\u001b[0m\n\u001b[0;32m 308\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 309\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 310\u001b[1;33m \u001b[0mres\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mskipped_tokens\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_parse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimestr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 311\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 312\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mres\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: 'NoneType' object is not iterable"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
...
...
@@ -142,7 +159,17 @@
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"'2014-02-02 00:00:00'"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
...
...
@@ -152,7 +179,8 @@
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
...
...
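The TypeError captured in the new output above is a known dateutil behaviour: its default parserinfo only recognizes English month names, and locale.setlocale() has no effect on it, so "20 janvier 2004" yields no parse result (dateutil versions of that era then crash with "'NoneType' object is not iterable" rather than raising a clean error). A possible workaround, not part of this commit, is a custom parserinfo; the FrenchParserInfo class below is purely illustrative:

    # Illustrative only: teach dateutil the French month names it lacks.
    import dateutil.parser

    class FrenchParserInfo(dateutil.parser.parserinfo):
        MONTHS = [('janv', 'janvier'), ('févr', 'février'), ('mars', 'mars'),
                  ('avr', 'avril'), ('mai', 'mai'), ('juin', 'juin'),
                  ('juil', 'juillet'), ('août', 'août'), ('sept', 'septembre'),
                  ('oct', 'octobre'), ('nov', 'novembre'), ('déc', 'décembre')]

    d = dateutil.parser.parse("20 janvier 2004", parserinfo=FrenchParserInfo())
    # -> datetime.datetime(2004, 1, 20, 0, 0)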
Test Pubmed parsing.ipynb (new file, 0 → 100644)
This diff is collapsed.
parsing/FileParsers/FileParser.py
...
@@ -3,12 +3,12 @@ from parsing.NgramsExtractors import *
 import collections
 import dateutil.parser
+import zipfile
 
 class NgramCache:
-    """
-    This allows the fast retrieval of ngram ids
-    from the cache instead of using the database for every call
+    """This allows the fast retrieval of ngram ids
+    from a cache instead of calling the database every time
     """
 
     def __init__(self, language):
...
@@ -35,9 +35,9 @@ class NgramCaches(collections.defaultdict):
 
-"""Base class for performing files parsing depending on their type.
-"""
 class FileParser:
+    """Base class for performing files parsing depending on their type.
+    """
 
     def __init__(self, file=None, filepath="", encoding="utf8"):
         # ...get the file item...
...
@@ -54,11 +54,10 @@ class FileParser:
         self._languages_fullname = {language.fullname.lower(): language for language in languages}
         self._languages_iso2 = {language.iso2.lower(): language for language in languages}
         self._languages_iso3 = {language.iso3.lower(): language for language in languages}
-        #self.parse()
 
-    """Extract the ngrams from a given text.
-    """
     def extract_ngrams(self, text, language):
+        """Extract the ngrams from a given text.
+        """
         # Get the appropriate ngrams extractor, if it exists
         if language not in self._extractors:
             extractor = None
...
@@ -75,20 +74,13 @@ class FileParser:
             for ngram in extractor.extract_ngrams(text):
                 ngram_text = ' '.join([token for token, tag in ngram])
                 tokens.append(ngram_text)
-            return collections.Counter(
-                # [token for token, tag in extractor.extract_ngrams(text)]
-                tokens
-            )
+            return collections.Counter(tokens)
         else:
             return dict()
 
-    #TODO
-    # * make it possible to tag and parse separately
-    # * only tags some data (only titles, titles & abstracts, some chapters...)
-    """Add a document to the database.
-    """
-    def create_document(self, parentNode, title, contents, language, metadata, guid=None):
+    def create_document(self, parentNode, title, metadata, guid=None):
+        """Add a document to the database.
+        """
+        metadata = self.format_metadata(metadata)
         # create or retrieve a resource for that document, based on its user id
         # if guid is None:
...
@@ -103,6 +95,10 @@ class FileParser:
         # if parentNode.descendants().filter(resource=resource).exists():
         #     return None
         # create the document itself
+        try:
+            language = self._languages_iso3[metadata["language_iso3"]]
+        except:
+            language = None
         childNode = Node(
             user = parentNode.user,
             type = self._document_nodetype,
...
@@ -113,39 +109,74 @@ class FileParser:
             parent = parentNode
         )
         childNode.save()
-        # parse it!
-        ngrams = self.extract_ngrams(contents, language)
-        # we are already in a transaction, so no use doing another one (or is there?)
-        ngramcache = self._ngramcaches[language]
-        for terms, occurences in ngrams.items():
-            ngram = ngramcache[terms]
-            Node_Ngram(node=childNode, ngram=ngram, occurences=occurences).save()
         # return the created document
         return childNode
 
-    """Useful method to detect the document encoding.
-    Not sure it should be here actually.
-    """
     def detect_encoding(self, string):
-        # see the chardet library
+        """Useful method to detect the document encoding.
+        """
         pass
 
-    """Parse the data.
-    This method shall be overriden by inherited classes.
-    """
-    def parse(self):
+    def _parse(self, parentNode, file):
+        """This method shall be overriden by inherited classes."""
         return list()
 
+    def parse(self, parentNode, file=None):
+        """Parse the files found in the file.
+        This method shall be overriden by inherited classes.
+        """
+        if file is None:
+            with transaction.atomic():
+                self.parse(parentNode, self._file)
+        if zipfile.is_zipfile(file):
+            with zipfile.ZipFile(file) as zipArchive:
+                for filename in zipArchive.namelist():
+                    self.parse(parentNode, zipArchive.open(filename, "r"))
+        else:
+            self._parse(parentNode, file)
+
+    def extract(self, parentNode, keys):
+        """Extract ngrams from the child nodes, given a list of field names."""
+        # get all the descendants of type "document"
+        childNodes = parentNode.descendants().filter(type=self._document_nodetype)
+        with transaction.atomic():
+            for childNode in childNodes:
+                # most importantly...
+                metadata = childNode.metadata
+                # which extractor shall we use?
+                if language not in self._extractors:
+                    extractor = None
+                    if language.iso2 == 'en':
+                        # use English
+                        extractor = EnglishNgramsExtractor()
+                    elif language.iso2 == 'fr':
+                        # use French
+                        extractor = FrenchNgramsExtractor()
+                    else:
+                        # no recognized language has been specified...
+                        continue
+                    self._extractors[language] = extractor
+                # extract ngrams from every field, find the id, count them
+                ngrams = collections.defaultdict(int)
+                ngramscache = self._ngramcaches[language]
+                for key in keys:
+                    for ngram in extractor.extract_ngrams(text):
+                        ngram_text = ' '.join([token for token, tag in ngram])
+                        ngram_id = ngramscache[ngramtext].id
+                        ngrams[ngram_id] += 1
+                # insert node/ngram associations in the database
+                for ngram_id, occurences in ngrams.items():
+                    Node_Ngram(node_id=childNode.id, ngram_id=ngram_id, occurences=occurences).save()
+
     def format_metadata_dates(self, metadata):
         """Format the dates found in the metadata.
-        Example: {"publication_date": "2014-10-23 09:57:42"} -> {...}
+        Examples:
+            {"publication_date": "2014-10-23 09:57:42"}
+            -> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014"}
         """
         # First, check the split dates...
...
@@ -185,8 +216,33 @@ class FileParser:
         # finally, return the result!
         return metadata
 
+    def format_metadata_languages(self, metadata):
+        """format the languages found in the metadata."""
+        try:
+            if "language_fullname" in metadata:
+                language = self._languages_fullname[metadata["language_fullname"].lower()]
+            elif "language_iso3" in metadata:
+                language = self._languages_iso3[metadata["language_iso3"].lower()]
+            elif "language_iso2" in metadata:
+                language = self._languages_iso2[metadata["language_iso2"].lower()]
+            else:
+                return metadata
+        except KeyError:
+            # the language has not been found
+            for key in ["language_fullname", "language_iso3", "language_iso2"]:
+                try:
+                    metadata.pop(key)
+                except:
+                    continue
+            return metadata
+        metadata["language_iso2"] = language.iso2
+        metadata["language_iso3"] = language.iso3
+        metadata["language_fullname"] = language.fullname
+        return metadata
+
     def format_metadata(self, metadata):
         """Format the metadata."""
         metadata = self.format_metadata_dates(metadata)
-        return metadata
\ No newline at end of file
+        metadata = self.format_metadata_languages(metadata)
+        return metadata
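A worked example of the metadata normalization introduced above, following the two format_* docstrings (the language values assume a Language row with fullname 'english', iso2 'en', iso3 'eng'; illustrative, not from the commit):

    # Hypothetical input/output of format_metadata(), per the docstrings above.
    metadata = {"publication_date": "2014-10-23 09:57:42",
                "language_fullname": "English"}
    metadata = parser.format_metadata(metadata)
    # -> the date is split into publication_year/month/day fields,
    #    and the two missing language fields are filled in:
    # {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...,
    #  "language_fullname": "english", "language_iso2": "en", "language_iso3": "eng"}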
parsing/FileParsers/PubmedFileParser.py
...
@@ -2,54 +2,47 @@ from django.db import transaction
 from lxml import etree
 from parsing.FileParsers.FileParser import FileParser
 from parsing.NgramsExtractors import *
-import zipfile
+import datetime
 
 class PubmedFileParser(FileParser):
 
-    def parse(self, parentNode=None, tag=True):
+    def _parse(self, parentNode, file):
         # open the file as XML
         xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
-        documents = []
-        with transaction.atomic():
-            with zipfile.ZipFile(self._file) as zipFile:
-                for filename in zipFile.namelist():
-                    file = zipFile.open(filename, "r")
-                    xml = etree.parse(file, parser=xml_parser)
-                    # parse all the articles, one by one
-                    # all database operations should be performed within one transaction
-                    xml_articles = xml.findall('PubmedArticle')
-                    for xml_article in xml_articles:
-                        # extract data from the document
-                        metadata = {}
-                        metadata_path = {
-                            "journal"           : 'MedlineCitation/Article/Journal/Title',
-                            "title"             : 'MedlineCitation/Article/ArticleTitle',
-                            "language_iso3"     : 'MedlineCitation/Article/Language',
-                            "doi"               : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
-                            "abstract"          : 'MedlineCitation/Article/Abstract/AbstractText',
-                            "publication_year"  : 'MedlineCitation/DateCreated/Year',
-                            "publication_month" : 'MedlineCitation/DateCreated/Month',
-                            "publication_day"   : 'MedlineCitation/DateCreated/Day',
-                        }
-                        for key, path in metadata_path.items():
-                            try:
-                                node = xml_article.find(path)
-                                metadata[key] = node.text
-                            except:
-                                metadata[key] = ""
-                        contents = metadata["abstract"]
-                        # create the document in the database
-                        document = self.create_document(
-                            parentNode = parentNode,
-                            title      = metadata["title"],
-                            contents   = contents,
-                            language   = self._languages_iso3[metadata["language_iso3"].lower()],
-                            metadata   = metadata,
-                            #guid      = metadata["doi"],
-                        )
-                        if document:
-                            documents.append(document)
-        return documents
+        xml = etree.parse(file, parser=xml_parser)
+        xml_articles = xml.findall('PubmedArticle')
+        # initialize the list of documents
+        documents = []
+        # parse all the articles, one by one
+        # all database operations should be performed within one transaction
+        for xml_article in xml_articles:
+            # extract data from the document
+            metadata = {}
+            metadata_path = {
+                "journal"           : 'MedlineCitation/Article/Journal/Title',
+                "title"             : 'MedlineCitation/Article/ArticleTitle',
+                "language_iso3"     : 'MedlineCitation/Article/Language',
+                "doi"               : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
+                "abstract"          : 'MedlineCitation/Article/Abstract/AbstractText',
+                "publication_year"  : 'MedlineCitation/DateCreated/Year',
+                "publication_month" : 'MedlineCitation/DateCreated/Month',
+                "publication_day"   : 'MedlineCitation/DateCreated/Day',
+            }
+            for key, path in metadata_path.items():
+                try:
+                    node = xml_article.find(path)
+                    metadata[key] = node.text
+                except:
+                    metadata[key] = ""
+            contents = metadata["abstract"]
+            # create the document in the database
+            document = self.create_document(
+                parentNode = parentNode,
+                title      = metadata["title"],
+                metadata   = metadata,
+                #guid      = metadata["doi"],
+            )
+            if document:
+                documents.append(document)
+        # return the list of documents
+        return documents
parsing/FileParsers/RisFileParser.py (new file, 0 → 100644)
from django.db import transaction
from parsing.FileParsers.FileParser import FileParser

class RisFileParser(FileParser):

    _parameters = {
    }

    def _parse(self, parentNode, file):
        metadata = {}
        last_key = None
        last_values = []
        with transaction.atomic():
            for line in self._file:
                if len(line) > 2:
                    parameter_key = line[:2]
                    if parameter_key != b'  ' and parameter_key != last_key:
                        if last_key in self._parameters:
                            parameter = self._parameters[last_key]
                            if parameter["type"] == "metadata":
                                separator = parameter["separator"] if "separator" in parameter else ""
                                metadata[parameter["key"]] = separator.join(last_values)
                            elif parameter["type"] == "delimiter":
                                language = self._languages_fullname[metadata["language"].lower()]
                                # self.create_document(
                                #     parentNode = parentNode,
                                #     title      = metadata["title"],
                                #     metadata   = metadata,
                                #     guid       = metadata["doi"]
                                # )
                                print(self.format_metadata(metadata))
                                print()
                                metadata = {}
                        last_key = parameter_key
                        last_values = []
                    last_values.append(line[3:-1].decode())
        self._file.close()
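The _parameters table is left empty in this commit, so _parse() currently skips every tag; a subclass or later commit would presumably fill it with entries shaped like the following (hypothetical, inferred only from how parameter["type"], parameter["key"] and parameter["separator"] are read above):

    # Hypothetical _parameters entries matching the lookups in _parse().
    _parameters = {
        b"TI": {"type": "metadata", "key": "title",    "separator": " "},
        b"AB": {"type": "metadata", "key": "abstract", "separator": " "},
        b"ER": {"type": "delimiter"},
    }

RIS input is line-oriented: each line carries a two-character tag read from line[:2], with the value taken from line[3:-1] and multi-line values accumulated until the tag changes.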