Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
755a8d4d
Commit
755a8d4d
authored
Oct 29, 2014
by
Mathieu Rodic
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEATURE] The parser is working, directly from a Node instance!
parent
c50f2fff
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
169 additions
and
308 deletions
+169
-308
models.py
node/models.py
+35
-10
Caches.py
parsing/Caches.py
+23
-9
FileParser.py
parsing/FileParsers/FileParser.py
+41
-155
PubmedFileParser.py
parsing/FileParsers/PubmedFileParser.py
+26
-37
RisFileParser.py
parsing/FileParsers/RisFileParser.py
+18
-26
__init__.py
parsing/__init__.py
+0
-50
test-parsing_from_node.py
test-parsing_from_node.py
+26
-21
No files found.
node/models.py
View file @
755a8d4d
from
django.db
import
models
from
django.utils
import
timezone
from
django.contrib.auth.models
import
User
from
django_hstore
import
hstore
from
cte_tree.models
import
CTENode
,
Manager
#from cte_tree.fields import DepthField, PathField, OrderingField
from
parsing.Caches
import
LanguagesCache
from
parsing.FileParsers
import
*
from
time
import
time
from
collections
import
defaultdict
from
django.contrib.auth.models
import
User
from
collections
import
defaultdict
# Some useful functions
# TODO: start the function name with an underscore (private)
...
...
@@ -28,7 +31,7 @@ class Language(models.Model):
def
__str__
(
self
):
return
self
.
fullname
class
Databas
eType
(
models
.
Model
):
class
Resourc
eType
(
models
.
Model
):
name
=
models
.
CharField
(
max_length
=
255
)
def
__str__
(
self
):
return
self
.
name
...
...
@@ -40,7 +43,7 @@ class Ngram(models.Model):
class
Resource
(
models
.
Model
):
guid
=
models
.
CharField
(
max_length
=
255
)
bdd_type
=
models
.
ForeignKey
(
Databas
eType
,
blank
=
True
,
null
=
True
)
type
=
models
.
ForeignKey
(
Resourc
eType
,
blank
=
True
,
null
=
True
)
file
=
models
.
FileField
(
upload_to
=
upload_to
,
blank
=
True
)
digest
=
models
.
CharField
(
max_length
=
32
)
# MD5 digest
...
...
@@ -89,12 +92,33 @@ class Node(CTENode):
node_resource
.
save
()
return
resource
def
parse
(
self
):
#
TODO: that's not very pretty...
# can't we make a simple join in Django?
def parse_resources(self):
    """Parse the unparsed resources of this node into 'Document' children.

    Every `node_resource` of this node whose `parsed` flag is False is
    handed to the file parser matching its resource type name.  The
    metadata dictionaries returned by the parsers are then inserted in
    one `bulk_create` call as `Node` rows that have this node as parent
    and the 'Document' node type.  Once the documents are created, the
    processed resources are flagged as parsed so that a later call does
    not import them a second time.

    Returns None.
    """
    # parser class for each known resource type name
    parser_classes = {
        'pubmed': PubmedFileParser,
        'isi': IsiFileParser,
        'ris': RisFileParser,
        'europress': EuropressFileParser,
    }
    pending = list(self.node_resource.filter(parsed=False))
    if not pending:
        return
    # parse all resources into a list of metadata
    metadata_list = []
    for node_resource in pending:
        resource = node_resource.resource
        # the resource type is nullable: a missing or unknown type falls
        # back on the generic parser (resolved lazily, as before)
        type_name = resource.type.name if resource.type is not None else None
        parser_class = parser_classes.get(type_name)
        if parser_class is None:
            parser_class = FileParser.FileParser
        metadata_list += parser_class().parse(str(resource.file))
    # insert in the database!
    document_type = NodeType.objects.get(name='Document')
    languages_cache = LanguagesCache()
    Node.objects.bulk_create([
        Node(
            user=self.user,
            type=document_type,
            name=metadata.get('title', ''),
            parent=self,
            language=(
                languages_cache[metadata['language_iso2']]
                if 'language_iso2' in metadata else None
            ),
            metadata=metadata,
        )
        for metadata in metadata_list
    ])
    # only flag the resources once their documents have been stored
    self.node_resource.filter(
        pk__in=[node_resource.pk for node_resource in pending]
    ).update(parsed=True)
def
extract_ngrams
(
self
,
keys
,
cache
):
# what do we want from the cache?
...
...
@@ -118,6 +142,7 @@ class Node(CTENode):
weight
=
weight
)
class
Node_Resource
(
models
.
Model
):
node
=
models
.
ForeignKey
(
Node
,
related_name
=
'node_resource'
)
resource
=
models
.
ForeignKey
(
Resource
)
...
...
@@ -126,7 +151,7 @@ class Node_Resource(models.Model):
class
Node_Ngram
(
models
.
Model
):
node
=
models
.
ForeignKey
(
Node
)
ngram
=
models
.
ForeignKey
(
Ngram
)
weight
=
models
.
Integer
Field
()
weight
=
models
.
Float
Field
()
class
Project
(
Node
):
class
Meta
:
...
...
parsing/Caches.py
View file @
755a8d4d
import
collections
from
node.models
import
Ngram
import
node.models
from
parsing.NgramsExtractors
import
EnglishNgramsExtractor
,
FrenchNgramsExtractor
from
collections
import
defaultdict
class
NgramsCache
(
collections
.
defaultdict
):
class
NgramsCache
(
defaultdict
):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time.
This class is language-specific."""
...
...
@@ -17,14 +17,14 @@ class NgramsCache(collections.defaultdict):
"""If the terms are not yet present in the dictionary,
retrieve it from the database or insert it."""
try
:
ngram
=
Ngram
.
get
(
terms
=
terms
,
language
=
self
.
language
)
ngram
=
node
.
models
.
Ngram
.
get
(
terms
=
terms
,
language
=
self
.
language
)
except
:
ngram
=
Ngram
(
terms
=
terms
,
n
=
len
(
terms
.
split
()),
language
=
self
.
language
)
ngram
=
node
.
models
.
Ngram
(
terms
=
terms
,
n
=
len
(
terms
.
split
()),
language
=
self
.
language
)
ngram
.
save
()
self
[
terms
]
=
ngram
return
self
[
terms
]
class
NgramsCaches
(
collections
.
defaultdict
):
class
NgramsCaches
(
defaultdict
):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time."""
def
__missing__
(
self
,
language
):
...
...
@@ -33,7 +33,7 @@ class NgramsCaches(collections.defaultdict):
self
[
language
]
=
NgramsCache
(
language
)
return
self
[
language
]
class
NgramsExtractorsCache
(
collections
.
defaultdict
):
class
NgramsExtractorsCache
(
defaultdict
):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time."""
def
__missing__
(
self
,
key
):
...
...
@@ -64,11 +64,25 @@ class NgramsExtractorsCache(collections.defaultdict):
# return the proper extractor
return
self
[
key
]
class LanguagesCache(defaultdict):
    """Dictionary of the known languages, loaded once from the database.

    Each language returned by `node.models.Language.objects.all()` is
    stored three times: under its lower-cased `iso2`, `iso3` and
    `fullname` attributes.  Looking up any other key goes through
    `__missing__`, which retries with the key stripped and lower-cased.
    """

    def __init__(self):
        super().__init__()
        for language in node.models.Language.objects.all():
            self[language.iso2.lower()] = language
            self[language.iso3.lower()] = language
            self[language.fullname.lower()] = language

    def __missing__(self, key):
        """Resolve a key that is not stored yet.

        The key is stripped and lower-cased before being looked up again.
        The outcome (the matching language, or None when there is none) is
        remembered under the original key and returned.
        """
        normalized_key = key.strip().lower()
        # dict.get() never calls __missing__, so an unknown language
        # resolves to None directly instead of through a recursive miss
        language = self.get(normalized_key)
        self[key] = language
        return language
class
Cache
:
class Caches:
    """Single entry point to every cache used by the parsing code.

    Attributes set by the constructor:
      - ngrams: a NgramsCaches instance
      - extractors: a NgramsExtractorsCache instance
      - languages: a LanguagesCache instance
    See NgramsCaches and NgramsExtractorsCache for better understanding.
    """

    def __init__(self):
        # the three caches are independent from each other; they are built
        # in this fixed order
        self.ngrams = NgramsCaches()
        self.extractors = NgramsExtractorsCache()
        self.languages = LanguagesCache()
parsing/FileParsers/FileParser.py
View file @
755a8d4d
from
node.models
import
Node
,
NodeType
,
Language
,
Ngram
,
Node_Ngram
from
parsing.NgramsExtractors
import
*
import
collections
import
dateutil.parser
import
zipfile
from
parsing.Caches
import
LanguagesCache
class
FileParser
:
"""Base class for performing files parsing depending on their type.
"""
def
__init__
(
self
,
file
=
None
,
filepath
=
""
,
encoding
=
"utf8"
):
# ...get the file item...
if
file
is
None
:
self
.
_file
=
open
(
filepath
,
"rb"
)
else
:
self
.
_file
=
file
# cache for ngrams
self
.
_ngramcaches
=
NgramCaches
()
# extractors
self
.
_extractors
=
dict
()
self
.
_document_nodetype
=
NodeType
.
objects
.
get
(
name
=
'Document'
)
languages
=
Language
.
objects
.
all
()
self
.
_languages_fullname
=
{
language
.
fullname
.
lower
():
language
for
language
in
languages
}
self
.
_languages_iso2
=
{
language
.
iso2
.
lower
():
language
for
language
in
languages
}
self
.
_languages_iso3
=
{
language
.
iso3
.
lower
():
language
for
language
in
languages
}
def
extract_ngrams
(
self
,
text
,
language
):
"""Extract the ngrams from a given text.
"""
# Get the appropriate ngrams extractor, if it exists
if
language
not
in
self
.
_extractors
:
extractor
=
None
if
language
.
iso2
==
'en'
:
extractor
=
EnglishNgramsExtractor
()
elif
language
.
iso2
==
'fr'
:
extractor
=
FrenchNgramsExtractor
()
self
.
_extractors
[
language
]
=
extractor
else
:
extractor
=
self
.
_extractors
[
language
]
# Extract the ngrams
if
extractor
:
tokens
=
[]
for
ngram
in
extractor
.
extract_ngrams
(
text
):
ngram_text
=
' '
.
join
([
token
for
token
,
tag
in
ngram
])
tokens
.
append
(
ngram_text
)
return
collections
.
Counter
(
tokens
)
else
:
return
dict
()
def
create_document
(
self
,
parentNode
,
title
,
metadata
,
guid
=
None
):
"""Add a document to the database.
"""
metadata
=
self
.
format_metadata
(
metadata
)
# create or retrieve a resource for that document, based on its user id
# if guid is None:
# resource = Resource(guid=guid)
# else:
# try:
# resource = Resource.get(guid=guid)
# except:
# resource = Resource(guid=guid)
# # If the parent node already has a child with this resource, pass
# # (is it a good thing?)
# if parentNode.descendants().filter(resource=resource).exists():
# return None
# create the document itself
try
:
language
=
self
.
_languages_iso3
[
metadata
[
"language_iso3"
]]
except
:
language
=
None
childNode
=
Node
(
user
=
parentNode
.
user
,
type
=
self
.
_document_nodetype
,
name
=
title
,
language
=
language
,
metadata
=
metadata
,
#resource = resource,
parent
=
parentNode
)
childNode
.
save
()
return
childNode
def
__init__
(
self
,
language_cache
=
None
):
self
.
_languages_cache
=
LanguagesCache
()
if
language_cache
is
None
else
language_cache
def detect_encoding(self, string):
    """Useful method to detect the document encoding.

    Not implemented yet: the body is a placeholder, so the call returns
    None whatever `string` is.
    """
    pass
def
_parse
(
self
,
parentNode
,
file
):
"""This method shall be overriden by inherited classes."""
return
list
()
def
parse
(
self
,
parentNode
,
file
=
None
):
"""Parse the files found in the file.
This method shall be overriden by inherited classes.
"""
if
file
is
None
:
with
transaction
.
atomic
():
self
.
parse
(
parentNode
,
self
.
_file
)
if
zipfile
.
is_zipfile
(
file
):
with
zipfile
.
ZipFile
(
file
)
as
zipArchive
:
for
filename
in
zipArchive
.
namelist
():
self
.
parse
(
parentNode
,
zipArchive
.
open
(
filename
,
"r"
))
else
:
self
.
_parse
(
parentNode
,
file
)
def
extract
(
self
,
parentNode
,
keys
):
"""Extract ngrams from the child nodes, given a list of field names."""
# get all the descendants of type "document"
childNodes
=
parentNode
.
descendants
()
.
filter
(
type
=
self
.
_document_nodetype
)
with
transaction
.
atomic
():
for
childNode
in
childNodes
:
# most importantly...
metadata
=
childNode
.
metadata
# which extractor shall we use?
if
language
not
in
self
.
_extractors
:
extractor
=
None
if
language
.
iso2
==
'en'
:
# use English
extractor
=
EnglishNgramsExtractor
()
elif
language
.
iso2
==
'fr'
:
# use French
extractor
=
FrenchNgramsExtractor
()
else
:
# no recognized language has been specified...
continue
self
.
_extractors
[
language
]
=
extractor
# extract ngrams from every field, find the id, count them
ngrams
=
collections
.
defaultdict
(
int
)
ngramscache
=
self
.
_ngramcaches
[
language
]
for
key
in
keys
:
for
ngram
in
extractor
.
extract_ngrams
(
text
):
ngram_text
=
' '
.
join
([
token
for
token
,
tag
in
ngram
])
ngram_id
=
ngramscache
[
ngramtext
]
.
id
ngrams
[
ngram_id
]
+=
1
# insert node/ngram associations in the database
for
ngram_id
,
occurences
in
ngrams
.
items
():
Node_Ngram
(
node_id
=
childNode
.
id
,
ngram_id
=
ngram_id
,
occurences
=
occurences
)
.
save
()
def
format_metadata_dates
(
self
,
metadata
):
"""Format the dates found in the metadata.
Examples:
{"publication_date": "2014-10-23 09:57:42"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
...
...
@@ -189,31 +62,24 @@ class FileParser:
metadata
[
prefix
+
"_minute"
]
=
date
.
strftime
(
"
%
M"
)
metadata
[
prefix
+
"_second"
]
=
date
.
strftime
(
"
%
S"
)
# finally, return the result!
# finally, return the
transformed
result!
return
metadata
def format_metadata_languages(self, metadata):
    """Format the languages found in the metadata.

    The keys "language_fullname", "language_iso3" and "language_iso2" are
    tried in that order.  The first one whose value is resolved by
    `self._languages_cache` to a truthy language wins, and the three
    language keys of `metadata` are then rewritten from that language's
    `fullname`, `iso3` and `iso2` attributes.  When no key is present, or
    none resolves, the metadata is left untouched.

    The dictionary is modified in place and also returned.
    """
    language = None
    for suffix in ("fullname", "iso3", "iso2"):
        # the metadata keys carry the "language_" prefix; testing the bare
        # suffix would never match them
        metadata_key = "language_" + suffix
        if metadata_key not in metadata:
            continue
        language = self._languages_cache[metadata[metadata_key]]
        if language:
            break
    if language:
        metadata["language_iso2"] = language.iso2
        metadata["language_iso3"] = language.iso3
        metadata["language_fullname"] = language.fullname
    return metadata
def
format_metadata
(
self
,
metadata
):
"""Format the metadata."""
metadata
=
self
.
format_metadata_dates
(
metadata
)
...
...
@@ -221,3 +87,23 @@ class FileParser:
return
metadata
def _parse(self, file):
    """Extract the raw metadata of one non-archive file.

    This method shall be overridden by inherited classes; the base
    implementation finds nothing and returns an empty list.
    """
    return []

def parse(self, file):
    """Parse the file, and its children files found in the file.

    `file` is either a path or a file object.  When it is a ZIP archive,
    each of its members is parsed in turn (recursively, so archives
    nested inside archives are handled too); otherwise it is passed to
    `_parse`.  Every metadata dictionary obtained that way goes through
    `format_metadata` exactly once.

    Returns the list of formatted metadata.
    """
    def is_archive(source):
        # zipfile.is_zipfile() moves the cursor of a file object while
        # looking for the archive signature: put it back afterwards so
        # that _parse() still reads the content from where it was
        if not hasattr(source, "read"):
            return zipfile.is_zipfile(source)
        try:
            offset = source.tell()
        except (AttributeError, OSError, ValueError):
            # no usable cursor: is_zipfile() cannot have moved it
            return zipfile.is_zipfile(source)
        try:
            return zipfile.is_zipfile(source)
        finally:
            source.seek(offset)

    def collect(source):
        # gather the raw (unformatted) metadata, recursing into archives
        if not is_archive(source):
            # ...not an archive, let's parse it directly!
            return list(self._parse(source))
        collected = []
        with zipfile.ZipFile(source) as archive:
            for member_name in archive.namelist():
                with archive.open(member_name, "r") as member:
                    collected += collect(member)
        return collected

    # format once, at the top level only, and return a real list
    return [self.format_metadata(metadata) for metadata in collect(file)]
parsing/FileParsers/PubmedFileParser.py
View file @
755a8d4d
...
...
@@ -5,16 +5,14 @@ from parsing.NgramsExtractors import *
class
PubmedFileParser
(
FileParser
):
def
_parse
(
self
,
parentNode
,
file
):
def
_parse
(
self
,
file
):
# open the file as XML
xml_parser
=
etree
.
XMLParser
(
resolve_entities
=
False
,
recover
=
True
)
xml
=
etree
.
parse
(
file
,
parser
=
xml_parser
)
xml_articles
=
xml
.
findall
(
'PubmedArticle'
)
with
transaction
.
atomic
():
# initialize the list of documents
documents
=
[]
# initialize the list of metadata
metadata_list
=
[]
# parse all the articles, one by one
# all database operations should be performed within one transaction
for
xml_article
in
xml_articles
:
# extract data from the document
metadata
=
{}
...
...
@@ -34,15 +32,6 @@ class PubmedFileParser(FileParser):
metadata
[
key
]
=
node
.
text
except
:
metadata
[
key
]
=
""
contents
=
metadata
[
"abstract"
]
# create the document in the database
document
=
self
.
create_document
(
parentNode
=
parentNode
,
title
=
metadata
[
"title"
],
metadata
=
metadata
,
#guid = metadata["doi"],
)
if
document
:
documents
.
append
(
document
)
# return the list of documents
return
documents
metadata_list
.
append
(
metadata
)
# return the list of metadata
return
metadata_list
parsing/FileParsers/RisFileParser.py
View file @
755a8d4d
...
...
@@ -7,11 +7,11 @@ class RisFileParser(FileParser):
_parameters
=
{
}
def
_parse
(
self
,
parentNode
,
file
):
def
_parse
(
self
,
file
):
metadata_list
=
[]
metadata
=
{}
last_key
=
None
last_values
=
[]
with
transaction
.
atomic
():
for
line
in
self
.
_file
:
if
len
(
line
)
>
2
:
parameter_key
=
line
[:
2
]
...
...
@@ -23,17 +23,9 @@ class RisFileParser(FileParser):
metadata
[
parameter
[
"key"
]]
=
separator
.
join
(
last_values
)
elif
parameter
[
"type"
]
==
"delimiter"
:
language
=
self
.
_languages_fullname
[
metadata
[
"language"
]
.
lower
()]
# self.create_document(
# parentNode = parentNode,
# title = metadata["title"],
# metadata = metadata,
# guid = metadata["doi"]
# )
print
(
self
.
format_metadata
(
metadata
))
print
()
metadata
=
{}
metadata_list
.
append
(
metadata
)
last_key
=
parameter_key
last_values
=
[]
last_values
.
append
(
line
[
3
:
-
1
]
.
decode
())
self
.
_file
.
close
()
return
metadata_list
parsing/__init__.py
deleted
100644 → 0
View file @
c50f2fff
#from .Taggers import *
#from .NgramsExtractors import *
from
.FileParsers
import
*
from
node.models
import
Node
,
NodeType
import
zipfile
import
collections
# import chardet
class
Parser
:
def
__init__
(
self
):
pass
def
parse_file
(
self
,
file
):
# CHECKER GUID!!!!!!!!!!!!!!!!!!!!!!!!!!!!
pass
def
parse_node_fichier
(
self
,
node
):
if
node
.
fichier
and
zipfile
.
is_zipfile
(
node
.
fichier
):
with
zipfile
.
ZipFile
(
node
.
fichier
,
"r"
)
as
zipFile
:
node_type
=
NodeType
.
objects
.
get
(
name
=
"Document"
)
for
filename
in
zipFile
.
namelist
():
file
=
zipFile
.
open
(
filename
,
"r"
)
node
.
objects
.
create
(
parent
=
node
,
type
=
node_type
,
user
=
node
.
user
,
)
def
parse_node
(
self
,
node
):
for
resource
in
node
.
resources
:
if
node
.
resources
.
file
and
zipfile
.
is_zipfile
(
node
.
resources
.
file
):
with
zipfile
.
ZipFile
(
node
.
resources
.
file
,
"r"
)
as
zipFile
:
for
filename
in
zipFile
.
namelist
():
file
=
zipFile
.
open
(
filename
,
"r"
)
Node
.
objects
.
create
(
parent
=
node
,
type
=
NodeType
.
get
(
name
=
"Document"
),
user
=
node
.
user
,
)
def
parse_node_recursively
(
self
,
node
):
self
.
parse_node
(
node
)
for
descendant
in
node
.
get_descendants
():
self
.
parse_node
(
descendant
)
test-parsing_from_node.py
View file @
755a8d4d
from
node.models
import
Node
,
NodeType
,
User
,
Language
from
parsing.Caches
import
Cache
from
node.models
import
Node
,
NodeType
,
User
,
Language
,
ResourceType
from
parsing.Caches
import
Cache
s
try
:
me
=
User
.
objects
.
get
(
username
=
'Mat'
)
...
...
@@ -7,6 +7,12 @@ except:
me
=
User
(
username
=
'Mat'
)
me
.
save
()
try
:
typePubmed
=
ResourceType
.
get
(
name
=
'pubmed'
)
except
:
typePubmed
=
ResourceType
(
name
=
'pubmed'
)
typePubmed
.
save
()
try
:
typeCorpus
=
NodeType
.
get
(
name
=
'corpus'
)
typeDoc
=
NodeType
.
get
(
name
=
'document'
)
...
...
@@ -25,25 +31,24 @@ try:
except
:
corpus
=
Node
(
name
=
'My first corpus'
,
type
=
typeCorpus
,
user
=
me
)
corpus
.
save
()
for
i
in
range
(
64
):
title
=
'Document #
%
d'
%
i
Node
(
user
=
me
,
# type = self._document_nodetype,
name
=
title
,
language
=
english
,
metadata
=
{
'title'
:
title
},
#resource = resource,
type
=
typeDoc
,
parent
=
corpus
)
.
save
()
corpus
.
add_resource
(
file
=
'/path/to/file'
)
corpus
.
parse
()
exit
()
cache
=
Cache
()
# for i in range(64):
# title = 'Document #%d' % i
# Node(
# user = me,
# # type = self._document_nodetype,
# name = title,
# language = english,
# metadata = {'title':title},
# #resource = resource,
# type = typeDoc,
# parent = corpus
# ).save()
corpus
.
children
.
all
()
.
delete
()
corpus
.
add_resource
(
file
=
'./data_samples/pubmed.zip'
,
type
=
typePubmed
)
corpus
.
parse_resources
()
cache
=
Caches
()
for
child
in
corpus
.
children
.
all
():
print
(
child
.
id
)
child
.
extract_ngrams
([
'title'
],
cache
)
\ No newline at end of file
print
(
'#
%
d
\t
%
s
\n
%
s
\n\n
'
%
(
child
.
id
,
child
.
name
,
child
.
metadata
[
'abstract'
]))
# child.extract_ngrams(['title'], cache)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment