Commit 34bdeee2 authored by c24b

NAMING convention for PARSER

parent 05ac0731
......@@ -224,21 +224,24 @@ PARSERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["parser"] is not No
CRAWLERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["crawler"] is not None]
def load_parser(resource):
    '''Given a resource, load the corresponding Parser class.

    resource(dict) > Parser(class), or None when the resource declares
    no parser (resource["parser"] is None).

    Naming convention, example with the resource parser "ISTexParser":
        PARSER filename: ISTEX        (parser name without "Parser", upper-cased)
        PARSER object:   ISTexParser  (attribute looked up in that module)

    Raises ImportError when the module cannot be imported and
    AttributeError when the module does not define the parser object.
    '''
    parser_name = resource["parser"]
    if parser_name is None:
        # resource without parsing capability: nothing to load
        return None
    filename = parser_name.replace("Parser", '').upper()
    module = importlib.import_module('gargantext.util.parsers.%s' % filename)
    return getattr(module, parser_name)
def load_crawler(resource):
'''given a resource load the corresponding Parser()
resource(dict) > Parser(object)
exemple with resource ISTexCrawler
PARSER filename: ISTEX
PARSER object: ISTexCrawler
'''
filename = resource["name"].replace("Crawler", "")
module = 'gargantext.util.crawlers.%s' %(filename)
......
from ._Parser import Parser
# from ..NgramsExtractors import *
import sys
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np
import os
class CSVParser(Parser):
    """Parser turning a delimited text file into a list of row dictionaries.

    The delimiter is guessed from the first lines of the file, the first row
    holding any text is taken as the header row, and every later row becomes
    a ``{header: cell}`` dictionary.
    """

    # Candidate delimiters; on a tie the earliest one in this tuple wins.
    _POSSIBLE_DELIMITERS = (',', ' ', '\t', ';', '|', ':')
    # Number of leading lines inspected when guessing the delimiter.
    _SAMPLE_SIZE = 10

    def CSVsample(self, small_contents, delim):
        """Return the field count of each row of *small_contents*.

        small_contents -- iterable of text lines
        delim          -- one-character delimiter handed to ``csv.reader``
        """
        return [len(row) for row in csv.reader(small_contents, delimiter=delim)]

    def parse(self, filebuf):
        """Parse *filebuf* and return a list of ``{header: value}`` dicts.

        filebuf -- binary file-like object; its content is decoded as UTF-8
                   and split on LF.

        Returns an empty list when the file holds no text at all or no row
        with a non-empty cell. Raises ValueError when none of the candidate
        delimiters splits the sampled lines into a constant number of fields.
        """
        print("CSV: parsing (assuming UTF-8 and LF line endings)")
        contents = filebuf.read().decode("UTF-8").split("\n")

        if not any(contents):
            # empty file: no delimiter to guess, no record to return
            return []

        # Empty lines (typically the one left by the final "\n") produce a
        # zero field count and would make every delimiter look irregular,
        # so they are left out of the sample.
        sample_contents = [line for line in contents[:self._SAMPLE_SIZE] if line]

        # # = = = = [ Delimiter selection ] = = = = #
        # A delimiter qualifies when every sampled row has the same, non-zero
        # number of fields; the one producing the most fields is selected.
        candidates = []
        for delim in self._POSSIBLE_DELIMITERS:
            freqs = self.CSVsample(sample_contents, delim)
            if freqs and sum(freqs) > 0 and len(set(freqs)) == 1:
                candidates.append((delim, sum(freqs)))
        if not candidates:
            raise ValueError(
                "CSV: no delimiter splits the first %d lines into a constant "
                "number of columns" % self._SAMPLE_SIZE)
        HighestDelim = max(candidates, key=lambda candidate: candidate[1])[0]
        print("CSV selected delimiter:", [HighestDelim])
        # # = = = = [ / Delimiter selection ] = = = = #

        # parse once; the header search and the record pass share the rows
        rows = list(csv.reader(contents, delimiter=HighestDelim))

        # # = = = = [ First data coordinate & headers ] = = = = #
        # header row = first row with any text in it
        header_row = next(
            (rownum for rownum, tokens in enumerate(rows) if "".join(tokens)),
            None)
        if header_row is None:
            return []
        header_tokens = rows[header_row]
        # first data column = first non-empty cell of the header row
        first_column = next(
            columnum for columnum, cell in enumerate(header_tokens) if cell)
        Headers_Int2Str = {
            columnum: header_tokens[columnum]
            for columnum in range(first_column, len(header_tokens))
        }
        # # = = = = [ / First data coordinate & headers ] = = = = #

        # # = = = = [ Reading the whole CSV and saving ] = = = = #
        hyperdata_list = []
        for rownum, tokens in enumerate(rows):
            if rownum % 250 == 0:
                print("CSV row: ", rownum)
            if rownum <= header_row:
                continue
            # cells beyond the last header column have no name: ignore them
            RecordDict = {
                Headers_Int2Str[columnum]: tokens[columnum]
                for columnum in range(first_column, len(tokens))
                if columnum in Headers_Int2Str
            }
            if RecordDict:
                hyperdata_list.append(RecordDict)
        # # = = = = [ / Reading the whole CSV and saving ] = = = = #
        return hyperdata_list
from ._Parser import Parser
from datetime import datetime
from io import BytesIO
import json
class ISTexParser(Parser):
    """Parser for JSON payloads whose "hits" entry lists ISTEX documents."""

    # hyperdata key -> key of the JSON document the value is copied from
    _HYPERDATA_PATHS = {
        "id"               : "id",
        "source"           : 'corpusName',
        "title"            : 'title',
        "genre"            : "genre",
        "language_iso3"    : 'language',
        "doi"              : 'doi',
        "host"             : 'host',
        "publication_date" : 'publicationDate',
        "abstract"         : 'abstract',
        "authorsRAW"       : 'author',
        "keywords"         : "keywords"
    }

    @staticmethod
    def _parse_date(raw_date):
        """Return the ``date`` described by *raw_date*, or None.

        A list is reduced to its first item. The formats tried depend on the
        length of the string: more than 8 characters -> year-month-day,
        5 to 8 -> year-month, 4 or fewer -> year alone. The month may be a
        number or an abbreviated name (``%b``, locale dependent).
        """
        if isinstance(raw_date, list):
            raw_date = raw_date[0] if raw_date else None
        if not isinstance(raw_date, str):
            return None
        if len(raw_date) > 8:
            formats = ('%Y-%b-%d', '%Y-%m-%d')
        elif len(raw_date) > 4:
            formats = ('%Y-%b', '%Y-%m')
        else:
            formats = ('%Y',)
        for date_format in formats:
            try:
                return datetime.strptime(raw_date, date_format).date()
            except ValueError:
                continue
        return None

    def parse(self, filebuf):
        """Read *filebuf* and return a list of hyperdata dicts.

        filebuf -- binary file-like object holding UTF-8 encoded JSON with a
                   "hits" list; it is closed once read, even on failure.

        Only documents with a parseable date are returned; each one gets
        "publication_year", "publication_month" and "publication_day" (as
        strings) in place of the raw date. The document date is tried first,
        then the "pubdate" of its host.
        """
        try:
            data = json.loads(filebuf.read().decode("UTF-8"))
        finally:
            filebuf.close()

        hyperdata_list = []
        for json_doc in data["hits"]:
            hyperdata = {}
            for key, path in self._HYPERDATA_PATHS.items():
                try:
                    hyperdata[key] = json_doc[path]
                except (KeyError, TypeError):
                    # field absent from this document
                    continue

            # the doi comes as a list: keep the first one, drop an empty list
            if "doi" in hyperdata:
                if hyperdata["doi"]:
                    hyperdata["doi"] = hyperdata["doi"][0]
                else:
                    del hyperdata["doi"]

            if "keywords" in hyperdata:
                hyperdata["keywords"] = ", ".join(
                    keyw["value"] for keyw in hyperdata["keywords"])

            # the host entry only serves as a source of fallback values and
            # is not kept in the result
            host = hyperdata.pop("host", None) or {}
            if (host.get("genre")
                    and "genre" in hyperdata
                    and len(hyperdata["genre"]) == 0):
                hyperdata["genre"] = host["genre"]
            if "title" in host:
                hyperdata["journal"] = host["title"]

            # raw dates, most specific first
            date_candidates = []
            if "publication_date" in hyperdata:
                date_candidates.append(hyperdata.pop("publication_date"))
            if "pubdate" in host:
                date_candidates.append(host["pubdate"])

            if "authorsRAW" in hyperdata:
                hyperdata["authors"] = ", ".join(
                    author["name"] for author in hyperdata["authorsRAW"])

            if "genre" in hyperdata and len(hyperdata["genre"]) == 0:
                hyperdata.pop("genre")

            if "language_iso3" in hyperdata:
                # retrieve lang if lang != [] and lang != ["unknown"]
                languages_found = hyperdata["language_iso3"]
                if len(languages_found) > 0 and languages_found[0] != "unknown":
                    hyperdata["language_iso3"] = languages_found[0]
                else:
                    # NB: 97% of ISTEX docs are in English, hence the default
                    # (cf. api.istex.fr/document/?q=*&facet=language
                    #  and langid tests on the language=["unknown"] docs)
                    # possibly even better: langid.classify(abstract)
                    hyperdata["language_iso3"] = "eng"

            publication_date = None
            for raw_date in date_candidates:
                publication_date = self._parse_date(raw_date)
                if publication_date is not None:
                    break
            if publication_date is None:
                # no usable date: the document is left out
                continue

            hyperdata["publication_year"] = str(publication_date.year)
            hyperdata["publication_month"] = str(publication_date.month)
            hyperdata["publication_day"] = str(publication_date.day)
            hyperdata_list.append(hyperdata)
        return hyperdata_list
from ._Parser import Parser
from gargantext.util.languages import languages
#from admin.utils import PrintException
class RepecParser(Parser):
    """Parser for tagged bibliographic records read as bytes lines.

    Each line starts with a two-byte tag listed in ``_parameters``; the value
    starts at byte offset ``_begin``. Consecutive lines with the same tag, and
    continuation lines starting with two spaces, add values to the same field.
    A line tagged ``ER`` closes the current record.
    """

    # offset of the value on a line (after the tag and its separator)
    _begin = 6
    # tag -> how the field is stored: target hyperdata key and the separator
    # used to join the values of a multi-line field
    _parameters = {
        b"ER": {"type": "delimiter"},
        b"T1": {"type": "hyperdata", "key": "title", "separator": " "},
        b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
        b"A1": {"type": "hyperdata", "key": "authors", "separator": "\n"},
        b"JO": {"type": "hyperdata", "key": "journal"},
        b"UR": {"type": "hyperdata", "key": "doi"},
        b"Y1": {"type": "hyperdata", "key": "publication_year"},
        b"PD": {"type": "hyperdata", "key": "publication_month"},
        b"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
        b"LA": {"type": "hyperdata", "key": "language_iso2"},
        b"N2": {"type": "hyperdata", "key": "abstract", "separator": " "},
        b"WC": {"type": "hyperdata", "key": "fields"},
    }

    @staticmethod
    def _set_default_language(hyperdata):
        """Set language_iso2 to 'en' when the record carries no language key."""
        language_keys = ('language_fullname', 'language_iso3', 'language_iso2')
        if not any(key in hyperdata for key in language_keys):
            hyperdata['language_iso2'] = 'en'

    def _store(self, key, values, hyperdata):
        """Fold the *values* gathered for tag *key* into *hyperdata*.

        Return True when *key* is the record delimiter (the record is then
        complete and has received its default language), False otherwise.
        Unknown tags are ignored.
        """
        parameter = self._parameters.get(key)
        if parameter is None:
            return False
        if parameter["type"] == "hyperdata":
            text = parameter.get("separator", "").join(values)
            if parameter["key"] == "publication_year":
                # keep the first four characters only (the year)
                text = text[:4]
            hyperdata[parameter["key"]] = text
            return False
        if parameter["type"] == "delimiter":
            self._set_default_language(hyperdata)
            return True
        return False

    def parse(self, file):
        """Yield one hyperdata dict per record found in *file*.

        file -- iterable of bytes lines. Lines that cannot be decoded are
                reported with ``print`` and skipped.
        """
        hyperdata = {}
        last_key = None
        last_values = []
        # browse every line of the file
        for line in file:
            if len(line) <= 2:
                continue
            # extract the parameter key
            parameter_key = line[:2]
            # two spaces mark a continuation of the current field
            if parameter_key != b'  ' and parameter_key != last_key:
                # a new field starts: store the one gathered so far
                if self._store(last_key, last_values, hyperdata):
                    yield hyperdata
                    hyperdata = {}
                last_key = parameter_key
                last_values = []
            try:
                # strip the line ending only, so a final line without a
                # newline keeps its last character
                last_values.append(line[self._begin:].rstrip(b"\r\n").decode())
            except UnicodeDecodeError as error:
                print(error)
        # end of file: the field still pending has not been stored yet
        if self._store(last_key, last_values, hyperdata):
            yield hyperdata
        elif hyperdata:
            # record not closed by a delimiter: yield it as well
            self._set_default_language(hyperdata)
            yield hyperdata
......@@ -3,7 +3,7 @@ When started, it initiates the parser;
when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged word returned as a tuple.
"""
from constants import RULE_JJNN, DEFAULT_MAX_NGRAM_LEN
from gargantext.constants import RULE_JJNN, DEFAULT_MAX_NGRAM_LEN
import re
import nltk
......@@ -50,6 +50,7 @@ class Tagger:
This method is called by the constructor, and can be overriden by
inherited classes.
"""
print("START")
self.extract(self.text)
def stop(self):
......
......@@ -26,10 +26,11 @@ def prepare_stemmers(corpus):
# always get a generic stemmer in case language code unknown
'__unknown__' : SnowballStemmer("english")
}
for lgiso2 in corpus.hyperdata['languages'].keys():
if (lgiso2 != '__skipped__'):
lgname = languages[lgiso2].name.lower()
stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
for lang in corpus.hyperdata["languages"].keys():
print(lang)
if (lang != '__skipped__'):
lgname = languages[lang].name.lower()
stemmers_by_lg[lang] = SnowballStemmer(lgname)
return stemmers_by_lg
def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
......@@ -38,6 +39,7 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)
"""
print(corpus.languages.keys())
stop_ngrams_ids = {}
# we will need the ngrams of the stoplist to filter
......
......@@ -53,19 +53,21 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
#load available taggers for source default langage
tagger_bots = {lang: load_tagger(lang) for lang in source['default_languages']}
#skipped documents that have been skipped previously for parsing error or unsupported language
print(corpus.skipped_docs)
docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
#sort docs by lang?
docs = sorted(docs, key= lambda k: k.language_iso2)
#print(corpus.hyperdata["languages"])
for documents_count, document in enumerate(docs):
lang_doc = document.hyperdata['language_iso2']
ngramextractor = tagger_bots[lang_doc]
lang_doc = document.language_iso2
print(lang_doc)
for key in keys:
value = document.hyperdata.get(key, None)
if not isinstance(value, str):
continue
# get ngrams
for ngram in ngramsextractor.extract(value):
for ngram in tagger_bots[lang_doc](value):
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
......@@ -93,11 +95,12 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
corpus.status('Ngrams', progress=documents_count+1)
corpus.save_hyperdata()
session.commit()
# integrate ngrams and nodes-ngrams
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
corpus.status('Ngrams', progress=documents_count+1, complete=True)
corpus.save_hyperdata()
session.commit()
else:
# integrate ngrams and nodes-ngrams
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
corpus.status('Ngrams', progress=documents_count+1, complete=True)
corpus.save_hyperdata()
session.commit()
except Exception as error:
corpus.status('Ngrams', error=error)
corpus.save_hyperdata()
......
......@@ -9,24 +9,31 @@ def parse(corpus):
try:
documents_count = 0
corpus.status('Docs', progress=0)
#print(corpus.resources())
#get the sources capabilities for a given corpus
sources = [get_resource(resource["type"]) for resource in corpus.resources() if not 'extracted' in resource.keys() ]
resource = corpus.resources()[0]
print(resource)
sources = [get_resource(resource["type"]) for resource in corpus.resources()]
print(sources)
if len(sources) == 0:
#>>> documents have already been parsed?????
return
raise ValueError(len(sources))
if len(sources) > 0:
#>>> necessairement 1 corpus = 1 source dans l'archi actuelle
source = sources[0]
if resource["parser"] is None:
if source["parser"] is None:
#corpus.status(error)
raise ValueError("Resource '%s' has no Parser" %resource["name"])
else:
corpus.languages = defaultdict.from_keys(sources[0]["default_languages"], 0)
corpus.languages = defaultdict.fromkeys(sources[0]["default_languages"], 0)
corpus.skipped_docs = []
session.add(corpus)
session.commit()
#load the corresponding parser
resource_parser = load_parser(source)
skipped_languages = []
# extract and insert documents from resource.path into database
print(resource)
for hyperdata in resource_parser(resource["path"]):
# indexed text fields defined in constants
for k in DEFAULT_INDEX_FIELDS:
......@@ -39,7 +46,7 @@ def parse(corpus):
# a simple census to raise language info at corpus level
if "language_iso2" in hyperdata.keys():
try:
corpus.languages[hyperdata["language_iso2"]] += 1
corpus.hyperdata["languages"][hyperdata["language_iso2"]] += 1
except KeyError:
hyperdata["error"] = "Error: unsupported language"
skipped_languages.append(hyperdata["language_iso2"])
......@@ -47,8 +54,6 @@ def parse(corpus):
hyperdata["error"] = "Error: no language found"
# save as DB child
# ----------------
document = corpus.add_child(
typename = 'DOCUMENT',
name = hyperdata.get('title', '')[:255],
......@@ -60,21 +65,21 @@ def parse(corpus):
document.status('Parsing', error= document.hyperdata["error"])
#session.delete(document)
corpus.skipped_docs.append(document.id)
# logging
if documents_count % BATCH_PARSING_SIZE == 0:
corpus.status('Docs', progress=documents_count)
corpus.save_hyperdata()
session.add(corpus)
session.commit()
documents_count += 1
# update info about the resource
resource['extracted'] = True
# add a corpus-level info about languages adding a __skipped__ info
corpus.hyperdata['languages']['__skipped__'] = Counter(skipped_languages)
corpus.languages['__skipped__'] = Counter(skipped_languages)
# commit all changes
corpus.status('Docs', progress=documents_count, complete=True)
corpus.save_hyperdata()
session.add(corpus)
session.commit()
except Exception as error:
corpus.status('Docs', error=error)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment