humanities / gargantext — Commit 82341b15
Authored Jul 27, 2016 by c24b

PARSER and CRAWLER changed with NEW NAMING CONVENTION
Parent: 6c5d252b

Showing 12 changed files with 1239 additions and 20 deletions (+1239 −20)
gargantext/util/crawlers/_Crawler.py    +115  −0
gargantext/util/crawlers/cern.py        +139  −0
gargantext/util/crawlers/istex.py       +70   −0
gargantext/util/crawlers/pubmed.py      +335  −0
gargantext/util/crawlers/util.py.old    +210  −0
gargantext/util/parsers/Cern.py         +8    −1
gargantext/util/parsers/Csv.py          +131  −0
gargantext/util/parsers/Istex.py        +139  −0
gargantext/util/parsers/Pubmed.py       +1    −0
gargantext/util/parsers/Repec.py        +67   −0
gargantext/util/parsers/_Parser.py      +4    −7
gargantext/util/parsers/__init__.py     +20   −12
gargantext/util/crawlers/_Crawler.py  (new file, mode 100644)
# Scrapers config
QUERY_SIZE_N_MAX = 1000

from gargantext.constants import get_resource
from gargantext.util.scheduling import scheduled
from gargantext.util.db import session
from requests_futures.sessions import FuturesSession
from gargantext.util.db import session
import requests
from gargantext.models.nodes import Node
#from gargantext.util.toolchain import parse_extract_indexhyperdata
from datetime import date

class Crawler:
    """Base class for performing a search and adding the corpus file, depending on the resource type
    """
    def __init__(self, record):
        #the name of the corpus
        #that will be built in case of internal fileparsing
        self.record = record
        self.name = record["corpus_name"]
        self.project_id = record["project_id"]
        self.user_id = record["user_id"]
        self.resource = record["source"]
        self.type = get_resource(self.resource)
        self.query = record["query"]
        #format the sampling
        self.n_last_years = 5
        self.YEAR = date.today().year
        #not great
        #but easy version
        self.MONTH = str(date.today().month)
        if len(self.MONTH) == 1:
            self.MONTH = "0" + self.MONTH
        self.MAX_RESULTS = 1000
        try:
            self.results_nb = int(record["count"])
        except KeyError:
            #does not exist yet
            self.results_nb = 0
        try:
            self.webEnv = record["webEnv"]
            self.queryKey = record["queryKey"]
            self.retMax = record["retMax"]
        except KeyError:
            #does not exist yet
            self.queryKey = None
            self.webEnv = None
            self.retMax = 1
        self.status = [None]
        self.path = "/tmp/results.txt"

    def tmp_file(self):
        '''here the results should be stored,
        depending on the type of format'''
        raise NotImplementedError

    def parse_query(self):
        '''here the parameters of the query should be parsed,
        depending on the type, to retrieve the set of activated search options
        '''
        raise NotImplementedError

    def fetch(self):
        if self.download():
            self.create_corpus()
            return self.corpus_id

    def get_sampling_dates(self):
        '''Create a sample list of min and max dates based on Y and M
        for the N_LAST_YEARS results'''
        dates = []
        for i in range(self.n_last_years):
            maxyear = self.YEAR - i
            mindate = str(maxyear - 1) + "/" + str(self.MONTH)
            maxdate = str(maxyear) + "/" + str(self.MONTH)
            print(mindate, "-", maxdate)
            dates.append((mindate, maxdate))
        return dates

    def create_corpus(self):
        #create a corpus
        corpus = Node(name = self.query,
                      user_id = self.user_id,
                      parent_id = self.project_id,
                      typename = 'CORPUS',
                      hyperdata = {"action"      : "Scrapping data",
                                   "language_id" : self.type["default_language"],
                                   }
                      )
        self.corpus_id = corpus.id
        if len(self.paths) > 0:
            for path in self.paths:
                #add the resource
                corpus.add_resource(type = self.type["type"],
                                    name = self.type["name"],
                                    path = path)
            session.add(corpus)
            session.commit()
            scheduled(parse_extract_indexhyperdata(corpus.id))
        else:
            #add the resource
            corpus.add_resource(type = self.type["type"],
                                name = self.type["name"],
                                path = self.path)
            session.add(corpus)
            session.commit()
            scheduled(parse_extract_indexhyperdata(corpus.id))
        return corpus
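For reference, a minimal sketch (not part of this commit) of what a concrete subclass must provide before fetch() can work: a download() method that writes the raw results to self.path and returns True. The DummyCrawler name and the record values below are invented for illustration, and a configured gargantext install is assumed.

    # Hypothetical subclass, for illustration only (not in the repository).
    from gargantext.util.crawlers._Crawler import Crawler

    class DummyCrawler(Crawler):
        def download(self):
            # write the raw results for self.query into self.path, report success
            with open(self.path, "w") as f:
                f.write("placeholder results for: %s" % self.query)
            return True

    # keys follow Crawler.__init__ above; values are placeholders
    record = {"corpus_name": "test corpus", "project_id": 1, "user_id": 1,
              "source": 3, "query": "graphene", "count": 0}
    # DummyCrawler(record).fetch()  # would create a CORPUS node and schedule parsing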
gargantext/util/crawlers/cern.py  (new file, mode 100644)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****  CERN Scrapper  *****
# ****************************
# Author:c24b
# Date: 27/05/2015
from ._Crawler import Crawler
import hmac, hashlib
import requests
import os
import random
import urllib.parse as uparse
from lxml import etree
from gargantext.settings import API_TOKENS
#from gargantext.util.files import build_corpus_path
from gargantext.util.db import session
from gargantext.models import Node

class CernCrawler(Crawler):
    '''CERN SCOAP3 API Interaction'''

    def __generate_signature__(self, url):
        '''creation of the signature'''
        #hmac-sha1 salted with secret
        return hmac.new(self.secret, url, hashlib.sha1).hexdigest()

    def __format_query__(self, query, of="xm", fields=None):
        ''' for query filters params
        see doc https://scoap3.org/scoap3-repository/xml-api/
        '''
        #dict_q = uparse.parse_qs(query)
        dict_q = {}
        #by default: search by pattern
        dict_q["p"] = query
        if fields is not None and isinstance(fields, list):
            fields = ",".join(fields)
            dict_q["f"] = fields
        #outputformat: "xm", "xmt", "h", "html"
        dict_q["of"] = of
        return dict_q

    def __format_url__(self, dict_q):
        '''format the url with encoded query'''
        #add the apikey
        dict_q["apikey"] = [self.apikey]
        params = "&".join([(str(k) + "=" + str(uparse.quote(v[0]))) for k, v in sorted(dict_q.items())])
        return self.BASE_URL + params

    def sign_url(self, dict_q):
        '''add signature'''
        API = API_TOKENS["CERN"]
        self.apikey = API["APIKEY"]
        self.secret = API["APISECRET"].encode("utf-8")
        self.BASE_URL = u"http://api.scoap3.org/search?"
        url = self.__format_url__(dict_q)
        return url + "&signature=" + self.__generate_signature__(url.encode("utf-8"))

    def create_corpus(self):
        #create a corpus
        corpus = Node(name = self.query,
                      #user_id = self.user_id,
                      parent_id = self.project_id,
                      typename = 'CORPUS',
                      hyperdata = {"action"      : "Scrapping data",
                                   "language_id" : self.type["default_language"]
                                   }
                      )
        #add the resource
        corpus.add_resource(type = self.type["type"],
                            name = self.type["name"],
                            path = self.path)
        try:
            print("PARSING")
            # p = eval(self.type["parser"])()
            session.add(corpus)
            session.commit()
            self.corpus_id = corpus.id
            parse_extract_indexhyperdata(corpus.id)
            return self
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            session.rollback()
            return self

    def download(self):
        import time
        self.path = "/tmp/results.xml"
        query = self.__format_query__(self.query)
        url = self.sign_url(query)
        start = time.time()
        r = requests.get(url, stream=True)
        downloaded = False
        #the long part
        with open(self.path, 'wb') as f:
            print("Downloading file")
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    #print("===")
                    f.write(chunk)
                    downloaded = True
        end = time.time()
        #print (">>>>>>>>>>LOAD results", end-start)
        return downloaded

    def scan_results(self):
        '''scan the number of results by fetching a single record
        (page 1, authors only) and reading the count
        from the comment at the top of the page
        '''
        import time
        self.results_nb = 0
        query = self.__format_query__(self.query, of="hb")
        query["ot"] = "100"
        query["jrec"] = '1'
        query["rg"] = '1'
        url = self.sign_url(query)
        print(url)
        #start = time.time()
        r = requests.get(url)
        #end = time.time()
        #print (">>>>>>>>>>LOAD results_nb", end-start)
        if r.status_code == 200:
            self.results_nb = int(r.text.split("-->")[0].split(': ')[-1][:-1])
            return self.results_nb
        else:
            raise ValueError(r.status)
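A hedged usage sketch (not from this commit): it assumes API_TOKENS["CERN"] in gargantext.settings holds a valid APIKEY/APISECRET and that the record dict carries the keys read by Crawler.__init__; the values below are placeholders.

    # Hypothetical usage, for illustration only.
    from gargantext.util.crawlers.cern import CernCrawler

    record = {"corpus_name": "scoap3 sample", "project_id": 1, "user_id": 1,
              "source": 3, "query": "higgs"}      # placeholder values
    crawler = CernCrawler(record)
    print(crawler.scan_results())    # count read from the signed "hb" query
    if crawler.download():           # streams the signed "xm" query to /tmp/results.xml
        print("results written to", crawler.path)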
gargantext/util/crawlers/istex.py  (new file, mode 100644)
from ._Crawler import *
import json

class ISTexCrawler(Crawler):
    """
    ISTEX Crawler
    """
    def __format_query__(self, query=None):
        '''formating query urlquote instead'''
        if query is not None:
            query = query.replace(" ", "+")
            return query
        else:
            self.query = self.query.replace(" ", "+")
            return self.query

    def scan_results(self):
        #get the number of results
        self.results_nb = 0
        self.query = self.__format_query__()
        _url = "http://api.istex.fr/document/?q=" + self.query + "&size=0"
        #"&output=id,title,abstract,pubdate,corpusName,authors,language"
        r = requests.get(_url)
        print(r)
        if r.status_code == 200:
            self.results_nb = int(r.json()["total"])
            self.status.append("fetching results")
            return self.results_nb
        else:
            self.status.append("error")
            raise ValueError(r.status)

    def download(self):
        '''fetching items'''
        downloaded = False

        def get_hits(future):
            '''here we directly get the result hits'''
            response = future.result()
            if response.status_code == 200:
                return response.json()["hits"]
            else:
                return None

        #session = FuturesSession()
        #self.path = "/tmp/results.json"
        self.status.append("fetching results")
        paging = 100
        self.query_max = self.results_nb
        if self.query_max > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (self.query_max, QUERY_SIZE_N_MAX)
            print("ERROR (scrap: istex d/l ): ", msg)
            self.query_max = QUERY_SIZE_N_MAX
        #urlreqs = []
        with open(self.path, 'wb') as f:
            for i in range(0, self.query_max, paging):
                url_base = "http://api.istex.fr/document/?q=" + self.query + "&output=*&from=%i&size=%i" % (i, paging)
                r = requests.get(url_base)
                if r.status_code == 200:
                    downloaded = True
                    f.write(r.text.encode("utf-8"))
                else:
                    downloaded = False
                    self.status.insert(0, "error fetching ISTEX " + r.status)
                    break
        return downloaded
gargantext/util/crawlers/pubmed.py  (new file, mode 100644)

[diff collapsed in the original view: +335 −0, content not shown]
gargantext/util/crawlers/util.py.old  (new file, mode 100644)
from gargantext.util.files import download

import sys
import time
import threading
from queue import Queue
from lxml import etree

if sys.version_info >= (3, 0):
    from urllib.request import urlopen
else:
    from urllib import urlopen

class Scraper:

    def __init__(self):
        self.queue_size = 8
        self.q = Queue()
        self.firstResults = []
        self.lock = threading.Lock()  # lock to serialize console output
        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
        self.pubMedDB = 'Pubmed'
        self.reportType = 'medline'

    # Return the globalResults!:
    # - count =
    # - queryKey =
    # - webEnv =
    def medlineEsearch(self, query):
        # print ("MedlineFetcher::medlineEsearch :")
        "Get number of results for query 'query' in variable 'count'"
        "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
        # print(query)
        origQuery = query
        query = query.replace(' ', '%20')
        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
                  % (self.pubMedEutilsURL, self.pubMedDB, query)
        try:
            eSearchResult = urlopen(eSearch)
            data = eSearchResult.read()
            root = etree.XML(data)
            findcount = etree.XPath("/eSearchResult/Count/text()")
            count = findcount(root)[0]
            findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
            queryKey = findquerykey(root)[0]
            findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
            webEnv = findwebenv(root)[0]
        except Exception as Error:
            print(Error)
            count = 0
            queryKey = False
            webEnv = False
            origQuery = False
        values = { "query"    : origQuery
                 , "count"    : int(count)
                 , "queryKey" : queryKey
                 , "webEnv"   : webEnv
                 }
        return values

    # RETMAX:
    # Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
    # maximum of 100,000 records
    def medlineEfetchRAW(self, fullquery):
        query    = fullquery["string"]
        retmax   = fullquery["retmax"]
        count    = fullquery["count"]
        queryKey = fullquery["queryKey"]
        webEnv   = fullquery["webEnv"]
        "Fetch medline result for query 'query', saving results to file every 'retmax' articles"
        queryNoSpace = query.replace(' ', '')  # No space in directory and file names, avoids stupid errors
        # print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
        retstart = 0
        eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' % (self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
        return eFetch

    # generic!
    def download(self, url):
        print(url)
        filename = download(url)
        with self.lock:
            print(threading.current_thread().name, filename + " OK")
            return filename

    # generic!
    def do_work(self, item):
        # time.sleep(1) # pretend to do some lengthy work.
        returnvalue = self.medlineEsearch(item)
        with self.lock:
            # print(threading.current_thread().name, item)
            return returnvalue

    # The worker thread pulls an item from the queue and processes it
    def worker(self):
        while True:
            item = self.q.get()
            self.firstResults.append(self.do_work(item))
            self.q.task_done()

    def worker2(self):
        while True:
            item = self.q.get()
            results = []
            try:
                result = self.download(item)
            except Exception as error:
                print(error)
                result = False
            self.firstResults.append(result)
            self.q.task_done()

    def chunks(self, l, n):
        print("chunks:")
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # GLOBALLIMIT:
    # I will retrieve this exact amount of publications.
    # The publications per year i'll retrieve per year will be :
    #   (k/N)*GlobalLimit
    #    \_ this is used as RETMAX
    # - k : Number of publications of x year (according to pubmed)
    # - N : Sum of every k belonging to {X} (total number of pubs according to pubmed)
    # - GlobalLimit : Number of publications i want.
    def serialFetcher(self, yearsNumber, query, globalLimit):
        # Create the queue and thread pool.
        for i in range(self.queue_size):
            t = threading.Thread(target=self.worker)
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        start = time.perf_counter()
        N = 0
        # print ("MedlineFetcher::serialFetcher :")
        thequeries = []
        globalresults = []
        for i in range(yearsNumber):
            year = str(2015 - i)
            # print ('YEAR ' + year)
            # print ('---------\n')
            pubmedquery = str(year) + '[dp] ' + query
            self.q.put(pubmedquery)  # put task in the queue
        self.q.join()
        print('time:', time.perf_counter() - start)
        Total = 0
        Fails = 0
        for globalresults in self.firstResults:
            # globalresults = self.medlineEsearch(pubmedquery)
            Total += 1
            if globalresults["queryKey"] == False:
                Fails += 1
            if globalresults["count"] > 0:
                N += globalresults["count"]
                queryhyperdata = { "string"   : globalresults["query"]
                                 , "count"    : globalresults["count"]
                                 , "queryKey" : globalresults["queryKey"]
                                 , "webEnv"   : globalresults["webEnv"]
                                 , "retmax"   : 0
                                 }
                thequeries.append(queryhyperdata)
        print("Total Number:", N, "publications")
        print("And i want just:", globalLimit, "publications")
        print("---------------------------------------\n")
        for i, query in enumerate(thequeries):
            k = query["count"]
            proportion = k / float(N)
            retmax_forthisyear = int(round(globalLimit * proportion))
            query["retmax"] = retmax_forthisyear
            if query["retmax"] == 0:
                query["retmax"] += 1
            print(query["string"], "\t[", k, ">", query["retmax"], "]")
        if ((Fails + 1) / (Total + 1)) == 1:  # for identifying the epic fail or connection error
            thequeries = [False]
        return thequeries
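The allocation at the end of serialFetcher is proportional: each year receives (k/N)·globalLimit documents as its retmax, rounded and bumped to at least 1. A standalone sketch of that arithmetic with invented counts:

    # Standalone illustration of the (k/N)*globalLimit allocation used in serialFetcher.
    # Counts per year are made up; only the arithmetic matches the method above.
    counts = {"2015": 120, "2014": 60, "2013": 20}   # k per year (hypothetical)
    global_limit = 50                                # total number of publications wanted
    N = sum(counts.values())
    for year, k in counts.items():
        retmax = int(round(global_limit * k / float(N))) or 1   # at least 1 per year
        print(year, k, "->", retmax)
    # 2015 120 -> 30, 2014 60 -> 15, 2013 20 -> 5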
gargantext/util/parsers/Cern.py  (modified)
...
@@ -2,6 +2,8 @@ from ._Parser import Parser
 from datetime import datetime
 from bs4 import BeautifulSoup
 from lxml import etree
+#import asyncio
+#q = asyncio.Queue(maxsize=0)

 class CernParser(Parser):
     #mapping MARC21 ==> hyperdata
...
@@ -52,10 +54,15 @@ class CernParser(Parser):
         print("Date", hyperdata["publication_date"])
         return hyperdata

+    #@asyncio.coroutine
     def parse(self, file):
+        print("PARSING")
         hyperdata_list = []
         doc = file.read()
-        soup = BeautifulSoup(doc.decode("utf-8"), "lxml")
+        print(doc[:35])
+        soup = BeautifulSoup(doc, "lxml")
+        #print(soup.find("record"))
         for record in soup.find_all("record"):
             hyperdata = {v:[] for v in self.MARC21["100"].values()}
             hyperdata["uid"] = soup.find("controlfield").text
...
gargantext/util/parsers/Csv.py  (new file, mode 100644)
from ._Parser import Parser
# from ..NgramsExtractors import *
import sys
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np
import os

class CSVParser(Parser):

    def CSVsample(self, small_contents, delim):
        reader = csv.reader(small_contents, delimiter=delim)
        Freqs = []
        for row in reader:
            Freqs.append(len(row))
        return Freqs

    def parse(self, filebuf):
        print("CSV: parsing (assuming UTF-8 and LF line endings)")
        contents = filebuf.read().decode("UTF-8").split("\n")
        sample_size = 10
        sample_contents = contents[0:sample_size]
        hyperdata_list = []

        # # = = = = [ Getting delimiters frequency ] = = = = #
        PossibleDelimiters = [',', ' ', '\t', ';', '|', ':']
        AllDelimiters = {}
        for delim in PossibleDelimiters:
            AllDelimiters[delim] = self.CSVsample(sample_contents, delim)
        # # = = = = [ / Getting delimiters frequency ] = = = = #
        # # OUTPUT example:
        # #  AllDelimiters = {
        # #    '\t': [1, 1, 1, 1, 1],
        # #    ' ': [1, 13, 261, 348, 330],
        # #    ',': [15, 15, 15, 15, 15],
        # #    ';': [1, 1, 1, 1, 1],
        # #    '|': [1, 1, 1, 1, 1]
        # #  }

        # # = = = = [ Stand.Dev=0 & Sum of delimiters ] = = = = #
        Delimiters = []
        for d in AllDelimiters:
            freqs = AllDelimiters[d]
            suma = np.sum(freqs)
            if suma > 0:
                std = np.std(freqs)
                # print [ d , suma , len(freqs) , std]
                if std == 0:
                    Delimiters.append([d, suma, len(freqs), std])
        # # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
        # # OUTPUT example:
        # #  Delimiters = [
        # #    ['\t', 5, 5, 0.0],
        # #    [',', 75, 5, 0.0],
        # #    ['|', 5, 5, 0.0]
        # #  ]

        # # = = = = [ Delimiter selection ] = = = = #
        Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
        HighestDelim = Sorted_Delims[0][0]
        # HighestDelim = ","
        print("CSV selected delimiter:", [HighestDelim])
        # # = = = = [ / Delimiter selection ] = = = = #

        # # = = = = [ First data coordinate ] = = = = #
        Coords = {"row": -1, "column": -1}
        reader = csv.reader(contents, delimiter=HighestDelim)
        for rownum, tokens in enumerate(reader):
            if rownum % 250 == 0:
                print("CSV row: ", rownum)
            joined_tokens = "".join(tokens)
            if Coords["row"] < 0 and len(joined_tokens) > 0:
                Coords["row"] = rownum
                for columnum in range(len(tokens)):
                    t = tokens[columnum]
                    if len(t) > 0:
                        Coords["column"] = columnum
                        break
        # # = = = = [ / First data coordinate ] = = = = #

        # # = = = = [ Setting Headers ] = = = = #
        Headers_Int2Str = {}
        reader = csv.reader(contents, delimiter=HighestDelim)
        for rownum, tokens in enumerate(reader):
            if rownum >= Coords["row"]:
                for columnum in range(Coords["column"], len(tokens)):
                    t = tokens[columnum]
                    Headers_Int2Str[columnum] = t
                break
        # print("Headers_Int2Str")
        # print(Headers_Int2Str)
        # # = = = = [ / Setting Headers ] = = = = #
        # # OUTPUT example:
        # #  Headers_Int2Str = {
        # #    0: 'publication_date',
        # #    1: 'publication_month',
        # #    2: 'publication_second',
        # #    3: 'abstract'
        # #  }

        # # = = = = [ Reading the whole CSV and saving ] = = = = #
        hyperdata_list = []
        reader = csv.reader(contents, delimiter=HighestDelim)
        for rownum, tokens in enumerate(reader):
            if rownum > Coords["row"]:
                RecordDict = {}
                for columnum in range(Coords["column"], len(tokens)):
                    data = tokens[columnum]
                    RecordDict[Headers_Int2Str[columnum]] = data
                if len(RecordDict.keys()) > 0:
                    hyperdata_list.append(RecordDict)
        # # = = = = [ / Reading the whole CSV and saving ] = = = = #

        return hyperdata_list
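The delimiter detection above keeps only candidates whose per-row field counts are constant over the sample (standard deviation 0) and picks the one with the highest total. A standalone sketch of that heuristic on a made-up sample, not tied to the class:

    # Standalone sketch of the delimiter heuristic used by CSVParser.parse
    # (sample lines are invented; the selection rule mirrors the code above).
    import csv
    import numpy as np

    sample = ["a;b;c", "1;2;3", "4;5;6"]
    candidates = [',', ' ', '\t', ';', '|', ':']
    kept = []
    for delim in candidates:
        freqs = [len(row) for row in csv.reader(sample, delimiter=delim)]
        if np.sum(freqs) > 0 and np.std(freqs) == 0:   # constant field count per row
            kept.append((delim, np.sum(freqs)))
    best = sorted(kept, key=lambda x: x[1], reverse=True)[0][0]
    print("selected delimiter:", [best])               # ';' for this sample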
gargantext/util/parsers/Istex.py  (new file, mode 100644)
from ._Parser import Parser
from datetime import datetime
from io import BytesIO
import json

class ISTexParser(Parser):

    def parse(self, filebuf):
        contents = filebuf.read().decode("UTF-8")
        data = json.loads(contents)
        filebuf.close()
        json_docs = data["hits"]
        hyperdata_list = []
        hyperdata_path = {
            "id"               : "id",
            "source"           : 'corpusName',
            "title"            : 'title',
            "genre"            : "genre",
            "language_iso3"    : 'language',
            "doi"              : 'doi',
            "host"             : 'host',
            "publication_date" : 'publicationDate',
            "abstract"         : 'abstract',
            # "authors"        : 'author',
            "authorsRAW"       : 'author',
            "keywords"         : "keywords"
        }
        suma = 0

        for json_doc in json_docs:
            hyperdata = {}
            for key, path in hyperdata_path.items():
                try:
                    # print(path," ==> ",len(json_doc[path]))
                    hyperdata[key] = json_doc[path]
                except:
                    pass
            # print("|",hyperdata["language_iso3"])

            if "doi" in hyperdata:
                hyperdata["doi"] = hyperdata["doi"][0]

            keywords = []
            if "keywords" in hyperdata:
                for keyw in hyperdata["keywords"]:
                    keywords.append(keyw["value"])
                hyperdata["keywords"] = ", ".join(keywords)

            moredate = False
            moresource = False
            if "host" in hyperdata:
                if "genre" in hyperdata["host"] and len(hyperdata["host"]["genre"]) > 0:
                    if "genre" in hyperdata and len(hyperdata["genre"]) == 0:
                        hyperdata["genre"] = hyperdata["host"]["genre"]
                # print(hyperdata["host"])
                if "pubdate" in hyperdata["host"]:
                    onebuffer = hyperdata["publication_date"]
                    hyperdata["publication_date"] = []
                    hyperdata["publication_date"].append(onebuffer)
                    hyperdata["publication_date"].append(hyperdata["host"]["pubdate"])
                if "title" in hyperdata["host"]:
                    hyperdata["journal"] = hyperdata["host"]["title"]

            authors = False
            if "authorsRAW" in hyperdata:
                names = []
                for author in hyperdata["authorsRAW"]:
                    names.append(author["name"])
                hyperdata["authors"] = ", ".join(names)

            if "host" in hyperdata:
                hyperdata.pop("host")

            if "genre" in hyperdata:
                if len(hyperdata["genre"]) == 0:
                    hyperdata.pop("genre")

            if "language_iso3" in hyperdata:
                # retrieve lang if lang != [] and lang != ["unknown"]
                # ---------------------------------------------------
                if len(hyperdata["language_iso3"]) > 0 and hyperdata["language_iso3"][0] != "unknown":
                    hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
                # default value = eng
                # possible even better: langid.classify(abstract)
                else:
                    # NB: 97% of istex docs are eng, hence the default
                    # ------------------------------------------------
                    hyperdata["language_iso3"] = "eng"
                    # (cf. api.istex.fr/document/?q=*&facet=language
                    #  and langid tests on the language=["unknown"] docs)

            if "publication_date" in hyperdata:
                RealDate = hyperdata["publication_date"]
                if "publication_date" in hyperdata:
                    hyperdata.pop("publication_date")
                if isinstance(RealDate, list):
                    RealDate = RealDate[0]
                # print( RealDate ," | length:",len(RealDate))
                Decision = ""
                if len(RealDate) > 4:
                    if len(RealDate) > 8:
                        try:
                            Decision = datetime.strptime(RealDate, '%Y-%b-%d').date()
                        except:
                            try:
                                Decision = datetime.strptime(RealDate, '%Y-%m-%d').date()
                            except:
                                Decision = False
                    else:
                        try:
                            Decision = datetime.strptime(RealDate, '%Y-%b').date()
                        except:
                            try:
                                Decision = datetime.strptime(RealDate, '%Y-%m').date()
                            except:
                                Decision = False
                else:
                    try:
                        Decision = datetime.strptime(RealDate, '%Y').date()
                    except:
                        Decision = False

                if Decision != False:
                    hyperdata["publication_year"] = str(Decision.year)
                    hyperdata["publication_month"] = str(Decision.month)
                    hyperdata["publication_day"] = str(Decision.day)

                    hyperdata_list.append(hyperdata)
                    # print("\t||",hyperdata["title"])
                    # print("\t\t",Decision)
                    # print("=============================")
                # else:
                #     suma+=1
                #     if "pubdate" in json_doc:
                #         print ("\tfail pubdate:",json_doc["pubdate"])
        # print ("nb_hits:",len(json_docs))
        # print("\t - nb_fails:",suma)
        # print(" -- - - - - - -- - -")
        return hyperdata_list
gargantext/util/parsers/Pubmed.py  (modified)
...
@@ -31,6 +31,7 @@ class PubmedParser(Parser):
         if isinstance(file, bytes):
             file = BytesIO(file)
         xml = etree.parse(file, parser=self.xml_parser)
+        #print(xml.find("PubmedArticle"))
         xml_articles = xml.findall('PubmedArticle')
         # initialize the list of hyperdata
         hyperdata_list = []
...
gargantext/util/parsers/Repec.py  (new file, mode 100644)
from ._Parser import Parser
from gargantext.util.languages import languages
#from admin.utils import PrintException

class RepecParser(Parser):

    # def __init__(self, language_cache=None):
    #
    #     #super(Parser, self).__init__()
    #     self._languages_cache = LanguagesCache() if language_cache is None else language_cache

    _begin = 6
    _parameters = {
        b"ER":  {"type": "delimiter"},
        b"T1":  {"type": "hyperdata", "key": "title", "separator": " "},
        b"ST":  {"type": "hyperdata", "key": "subtitle", "separator": " "},
        b"A1":  {"type": "hyperdata", "key": "authors", "separator": "\n"},
        b"JO":  {"type": "hyperdata", "key": "journal"},
        b"UR":  {"type": "hyperdata", "key": "doi"},
        b"Y1":  {"type": "hyperdata", "key": "publication_year"},
        b"PD":  {"type": "hyperdata", "key": "publication_month"},
        b"N1":  {"type": "hyperdata", "key": "references", "separator": ", "},
        b"LA":  {"type": "hyperdata", "key": "language_iso2"},
        b"N2":  {"type": "hyperdata", "key": "abstract", "separator": " "},
        b"WC":  {"type": "hyperdata", "key": "fields"},
    }

    def parse(self, file):
        hyperdata = {}
        last_key = None
        last_values = []
        # browse every line of the file
        for line in file:
            if len(line) > 2:
                # extract the parameter key
                parameter_key = line[:2]
                if parameter_key != b'  ' and parameter_key != last_key:
                    if last_key in self._parameters:
                        # translate the parameter key
                        parameter = self._parameters[last_key]
                        if parameter["type"] == "hyperdata":
                            separator = parameter["separator"] if "separator" in parameter else ""
                            if parameter["key"] == "publication_year":
                                hyperdata[parameter["key"]] = separator.join(last_values)[:4]
                            else:
                                hyperdata[parameter["key"]] = separator.join(last_values)
                        elif parameter["type"] == "delimiter":
                            if 'language_fullname' not in hyperdata.keys():
                                if 'language_iso3' not in hyperdata.keys():
                                    if 'language_iso2' not in hyperdata.keys():
                                        hyperdata['language_iso2'] = 'en'
                            yield hyperdata
                            hyperdata = {}
                    last_key = parameter_key
                    last_values = []
                try:
                    last_values.append(line[self._begin:-1].decode())
                except Exception as error:
                    print(error)
        # if a hyperdata object is left in memory, yield it as well
        if hyperdata:
            yield hyperdata
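For reference, a sketch of the tagged line format this parser walks through: a two-byte tag, the value starting at byte offset _begin = 6, and an ER tag closing each record. The sample record below is invented; only the slicing mirrors the parse method above.

    # Illustration of the slicing used in RepecParser.parse on an invented record.
    # line[:2] is the tag, line[6:-1] the value; b"ER" closes a record.
    sample = [b"T1  - A made-up working paper title\n",
              b"A1  - Doe, Jane\n",
              b"Y1  - 2015-07-01\n",
              b"ER  - \n"]
    for line in sample:
        tag, value = line[:2], line[6:-1].decode()
        print(tag, "->", value)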
gargantext/util/parsers/_Parser.py  (modified)
...
@@ -20,14 +20,9 @@ class Parser:
         self._file = file

     def __del__(self):
-        self._file.close()
+        if hasattr(self, '_file'):
+            self._file.close()

-    def detect_format(self, afile, a_formats):
-        #import magic
-        print("Detecting format")
-        #print(magic.from_file(afile))
-        return
-
     def detect_encoding(self, string):
         """Useful method to detect the encoding of a document.
...
@@ -167,6 +162,8 @@ class Parser:
     def __iter__(self, file=None):
         """Parse the file, and its children files found in the file.
+        C24B comment: the storage/extraction of the file should be done upstream,
+        and this method is a bit obscure
         """
         if file is None:
             file = self._file
...
gargantext/util/parsers/__init__.py  (modified)
-from .Ris import RISParser
-from .Ris_repec import RepecParser
-from .Isi import ISIParser
-# from .Jstor import JstorParser
-#     from .Zotero import ZoteroParser
-from .Pubmed import PubmedParser
-# # 2015-12-08: parser 2 in 1
-from .Europress import EuropressParser
-from .ISTex import ISTexParser
-from .CSV import CSVParser
-from .Cern import CernParser
+import importlib
+from gargantext.constants import RESOURCETYPES
+from gargantext.settings import DEBUG
+#if DEBUG:
+#    print("Loading available PARSERS:")
+
+base_parser = "gargantext.util.parsers"
+for resource in RESOURCETYPES:
+    if resource["parser"] is not None:
+        #parser file is without Parser
+        try:
+            fname = resource["parser"].replace("Parser", "")
+            #parser file is formatted as a title
+            module = base_parser + ".%s" % (fname.title())
+            #parser module is as shown in constants
+            parser = importlib.import_module(module)
+            #if DEBUG:
+            #    print("\t-", resource["parser"])
+            #getattr(parser,resource["parser"])
+        except Exception as e:
+            print("Check constants.py %s\n LANGUAGES declaration of taggers. Parser %s is not available" % (str(e), resource["parser"]))
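The loop above resolves each parser module from the name declared in constants instead of importing every parser explicitly. A standalone sketch of the same importlib pattern, outside the loop; "CSVParser" is one of the declared parser names and the except branch simply reports what could not be loaded:

    # Standalone sketch of the dynamic-loading pattern used in __init__.py above.
    import importlib

    base_parser = "gargantext.util.parsers"
    parser_name = "CSVParser"                    # as declared in RESOURCETYPES
    module_name = base_parser + ".%s" % parser_name.replace("Parser", "").title()
    try:
        module = importlib.import_module(module_name)   # gargantext.util.parsers.Csv
        parser_class = getattr(module, parser_name)
        print("loaded", parser_class)
    except Exception as e:
        print("Parser %s is not available: %s" % (parser_name, e))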