Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
86bbf12a
Commit
86bbf12a
authored
Oct 23, 2014
by
Mathieu Rodic
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEATURE] Dates in parsing metadata - All the dates are being formatted in FileParser
parent
5036bc48
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
119 additions
and
569 deletions
+119
-569
Test ISI parsing-checkpoint.ipynb
.ipynb_checkpoints/Test ISI parsing-checkpoint.ipynb
+59
-64
test_parser_ngramextractor (Mat)-checkpoint.ipynb
...kpoints/test_parser_ngramextractor (Mat)-checkpoint.ipynb
+0
-127
Test ISI parsing.ipynb
Test ISI parsing.ipynb
+49
-33
FileParser.py
parsing/FileParsers/FileParser.py
+1
-0
PubmedFileParser.py
parsing/FileParsers/PubmedFileParser.py
+10
-13
test_parser_ngramextractor (Mat).ipynb
test_parser_ngramextractor (Mat).ipynb
+0
-332
No files found.
.ipynb_checkpoints/Test ISI parsing-checkpoint.ipynb
View file @
86bbf12a
This diff is collapsed.
Click to expand it.
.ipynb_checkpoints/test_parser_ngramextractor (Mat)-checkpoint.ipynb
deleted
100644 → 0
View file @
5036bc48
{
"metadata": {
"name": "",
"signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
Test ISI parsing.ipynb
View file @
86bbf12a
This diff is collapsed.
Click to expand it.
parsing/FileParsers/FileParser.py
View file @
86bbf12a
...
@@ -89,6 +89,7 @@ class FileParser:
...
@@ -89,6 +89,7 @@ class FileParser:
"""Add a document to the database.
"""Add a document to the database.
"""
"""
def
create_document
(
self
,
parentNode
,
title
,
contents
,
language
,
metadata
,
guid
=
None
):
def
create_document
(
self
,
parentNode
,
title
,
contents
,
language
,
metadata
,
guid
=
None
):
metadata
=
self
.
format_metadata
(
metadata
)
# create or retrieve a resource for that document, based on its user id
# create or retrieve a resource for that document, based on its user id
# if guid is None:
# if guid is None:
# resource = Resource(guid=guid)
# resource = Resource(guid=guid)
...
...
parsing/FileParsers/PubmedFileParser.py
View file @
86bbf12a
...
@@ -16,7 +16,6 @@ class PubmedFileParser(FileParser):
...
@@ -16,7 +16,6 @@ class PubmedFileParser(FileParser):
with
zipfile
.
ZipFile
(
self
.
_file
)
as
zipFile
:
with
zipfile
.
ZipFile
(
self
.
_file
)
as
zipFile
:
for
filename
in
zipFile
.
namelist
():
for
filename
in
zipFile
.
namelist
():
file
=
zipFile
.
open
(
filename
,
"r"
)
file
=
zipFile
.
open
(
filename
,
"r"
)
# print(file.read())
xml
=
etree
.
parse
(
file
,
parser
=
xml_parser
)
xml
=
etree
.
parse
(
file
,
parser
=
xml_parser
)
# parse all the articles, one by one
# parse all the articles, one by one
...
@@ -24,19 +23,17 @@ class PubmedFileParser(FileParser):
...
@@ -24,19 +23,17 @@ class PubmedFileParser(FileParser):
xml_articles
=
xml
.
findall
(
'PubmedArticle'
)
xml_articles
=
xml
.
findall
(
'PubmedArticle'
)
for
xml_article
in
xml_articles
:
for
xml_article
in
xml_articles
:
# extract data from the document
# extract data from the document
date_year
=
int
(
xml_article
.
find
(
'MedlineCitation/DateCreated/Year'
)
.
text
)
metadata
=
{}
date_month
=
int
(
xml_article
.
find
(
'MedlineCitation/DateCreated/Month'
)
.
text
)
date_day
=
int
(
xml_article
.
find
(
'MedlineCitation/DateCreated/Day'
)
.
text
)
metadata
=
{
"date_pub"
:
'
%
s-
%
s-
%
s'
%
(
date_year
,
date_month
,
date_day
),
}
metadata_path
=
{
metadata_path
=
{
"journal"
:
'MedlineCitation/Article/Journal/Title'
,
"journal"
:
'MedlineCitation/Article/Journal/Title'
,
"title"
:
'MedlineCitation/Article/ArticleTitle'
,
"title"
:
'MedlineCitation/Article/ArticleTitle'
,
"language_iso3"
:
'MedlineCitation/Article/Language'
,
"language_iso3"
:
'MedlineCitation/Article/Language'
,
"doi"
:
'PubmedData/ArticleIdList/ArticleId[type=doi]'
,
"doi"
:
'PubmedData/ArticleIdList/ArticleId[type=doi]'
,
"abstract"
:
'MedlineCitation/Article/Abstract/AbstractText'
"abstract"
:
'MedlineCitation/Article/Abstract/AbstractText'
,
}
"publication_year"
:
'MedlineCitation/DateCreated/Year'
,
"publication_month"
:
'MedlineCitation/DateCreated/Month'
,
"publication_day"
:
'MedlineCitation/DateCreated/Day'
,
}
for
key
,
path
in
metadata_path
.
items
():
for
key
,
path
in
metadata_path
.
items
():
try
:
try
:
node
=
xml_article
.
find
(
path
)
node
=
xml_article
.
find
(
path
)
...
...
test_parser_ngramextractor (Mat).ipynb
deleted
100644 → 0
View file @
5036bc48
{
"metadata": {
"name": "",
"signature": "sha256:71dcc854ee670084dd2d3795a96e0faa7d3feb1f1958d41b08c32fe1a0d70be9"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Ok!"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"plant-pathogenic rna virus\n",
"significant source\n",
"result\n",
"host populations\n",
"in\n",
"arthropod hosts\n",
"unique example\n",
"spread\n",
"tobacco ringspot\n",
"colony survival\n",
"apis mellifera\n",
"other bee viruses"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"negative impact\n",
"threat\n",
"honeybees\n",
"varroa mites\n",
"intracellular life cycle\n",
"virus\n",
"conjunction\n",
"honeybee hosts\n",
"bee hemolymph\n",
"distinct lineage\n",
"transkingdom host alteration"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"monophyletic clade\n",
"prevalence\n",
"winter\n",
"pathogen host shifts\n",
"furthermore\n",
"species-level genetic variation\n",
"trsvs\n",
"diseases\n",
"gradual decline\n",
"domesticates\n",
"systemic invasion"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"high mutation rates\n",
"pathogenesis\n",
"entire body\n",
"humans\n",
"plant hosts\n",
"infections\n",
"virions\n",
"plant\n",
"varroa"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"infectious diseases\n",
"winter colony collapse\n",
"infected colonies\n",
"rna viruses\n",
"gastric cecum"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"trsv-infected individuals\n",
"instances\n",
"host ranges\n",
"health\n",
"viruses\n",
"study\n",
"bees\n",
"ectoparasitic varroa\n",
"present study\n",
"tree topology"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"animal kingdoms\n",
"phylogenetic analysis\n",
"colonies\n",
"feed\n",
"common ancestor\n",
"trsv\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
}
],
"metadata": {}
}
]
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment