Commit 5635c954 authored by Anne-Laure Thomas Derepas

Merge branch 'dev' into 'master'

Dev

See merge request athomas/gargantexternal-tools!13
parents 41e00418 22967fb8
#!/usr/bin/env python
# coding: utf-8
# Istex2ggtx.py: convert an Istex zip archive into a TSV file for GarganText.

import json
import sys
import datetime
import zipfile

import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

output = []
with zipfile.ZipFile(input_file, 'r') as zip_ref:
    for file in zip_ref.namelist():
        # Skip everything that is not a JSON article (e.g. manifest.json or directories).
        if not file.endswith('.json') or file.split('.')[0] == 'manifest':
            continue
        try:
            article = json.load(zip_ref.open(file))
            temp = {}
            temp["title"] = article.get("title", "").encode('UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
            temp["abstract"] = article.get("abstract", "").encode('UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
            authors = ""
            for author in article.get("author", []):
                authors += author["name"] + ", "
            authors = authors[:-2]
            temp["code"] = article.get("_id")
            temp["authors"] = authors.encode('UTF-8', errors='ignore').decode("utf-8").replace("\t", " ").replace(",", ";")
            temp["source"] = article["host"]["title"].encode('UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
            temp["publication_year"] = article.get("publicationDate", datetime.date.today().year)
            temp["publication_month"] = 1
            temp["publication_day"] = 1
            output.append(temp)
        except Exception as e:
            print(file, e)

output = pd.DataFrame(output)
# Titles that only differ in case or commas are considered duplicates.
duplicated = output['title'].str.lower().replace(",", "", regex=True).duplicated()
if duplicated.any():
    print("\nSome files were not added to the TSV because they would appear several times:")
    for i in range(output["title"].size):
        if duplicated[i]:
            print("\t" + output["code"][i] + " " + output["title"][i])
output = output.drop(['code'], axis=1)
output = output[~duplicated]

output.to_csv(output_file, sep='\t', index=False)
print("")
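To smoke-test the converter locally, one can build a miniature Istex-like archive. This sketch is hypothetical: the file name and field values are made up to match the fields the script reads, not a real Istex export.

```python
import json
import zipfile

# Hypothetical one-article archive with the fields Istex2ggtx.py expects.
article = {
    "_id": "A1",
    "title": "A test article",
    "abstract": "Short abstract.",
    "author": [{"name": "Ada Lovelace"}],
    "host": {"title": "Test Journal"},
    "publicationDate": 1999,
}
with zipfile.ZipFile("sample.zip", "w") as z:
    z.writestr("A1.json", json.dumps(article))
# Then run: python3 Istex2ggtx.py sample.zip out.tsv
```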
# IstexToGargantext
## About The Project
IstexToGargantext converts a zip file from Istex into a TSV file for GarganText.
## Usage
```shell
python3 Istex2ggtx.py file.zip output.tsv
```
The script takes the Istex zip archive and the output TSV path as arguments.
## Date
This script was last updated on 2023/07/24.
It may be outdated in the future.
# ZoteroToGargantext
## About The Project
ZoteroToGarganText isn't usable right now; it needs modifications to transform txt and pdf files from Zotero into a TSV for GarganText.
## Usage
```shell
python3 ZoteroToGarganText.py
```
from pyzotero import zotero
from datetime import date


def getDataFromWebPage(item):
    # Title
    title = item['data']['title']
    # Authors
    if 'creators' in item['data'].keys():
        authors = []
        for author in item['data']['creators']:
            authors.append(author['lastName'])
        authors = ';'.join(authors)
    else:
        authors = ''
    # Source
    source = item['data']['url']
    # Abstract
    if 'abstractNote' in item['data'].keys():
        abstract = item['data']['abstractNote']
    else:
        abstract = ''
    # Date
    if 'date' in item['data'].keys() and item['data']['date'] != '':
        pdate = item['data']['date'].split('-')
        pdate[2] = pdate[2].split('T')[0]
        pdate = '\t'.join(pdate)
    else:
        pdate = str(date.today().year) + '\t1\t1'
    abstract = abstract.encode(encoding='UTF-8', errors='ignore').decode(
        "utf-8").replace('\t', '').replace('"', '').replace('\n', '')
    title = title.encode(encoding='UTF-8', errors='ignore').decode(
        "utf-8").replace('\t', '').replace('"', '').replace('\n', '')
    source = source.encode(encoding='UTF-8', errors='ignore').decode(
        "utf-8").replace('\t', '').replace('"', '').replace('\n', '')
    # Output row: one TSV line per item.
    return str(title) + "\t" + source + "\t" + str(pdate) + "\t" + abstract + "\t" + authors + "\t" + str(1) + "\n"


def makeTSV(items):
    txt = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
    for item in items:
        if item['data']['itemType'] in ['webpage', 'encyclopediaArticle', 'blogPost']:
            txt += getDataFromWebPage(item)
        elif item['data']['itemType'] == 'attachment':
            # with open('tmp/' + item['data']['title'], 'wb') as f:
            #     f.write(zot.file(item['data']['key']))
            print(item)
        else:
            print("??")
            # print(item['data']['itemType'])
    with open('output.tsv', 'w') as f:
        f.write(txt)


print("Id:")
id = input()
zot = zotero.Zotero(id, 'user')
print("Items (i)/ Collection (c)")
t = input()
if t == 'i':
    print('Search :')
    search = input()
    zot.add_parameters(q=search)
    items = zot.top()
else:
    docs = zot.collections()
    tmp = {}
    print('Collection :')
    for doc in docs:
        tmp[doc['data']['name']] = doc['data']['key']
        print(doc['data']['name'])
    print("choose collection")
    col = input()
    items = []
    for elem in col.split(' '):
        items += zot.collection_items(tmp[elem])

makeTSV(items)
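For reference, the same pyzotero calls in non-interactive form. The library ID below is hypothetical, and the library must be publicly readable since the script passes no API key:

```python
from pyzotero import zotero

zot = zotero.Zotero('1234567', 'user')  # hypothetical public user-library ID
zot.add_parameters(q='brain')           # same keyword filter the script sets
items = zot.top()                       # top-level items matching the search
```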
import sys
import json
from datetime import date

import requests as req

# Usage: python3 IsidoreAPIToGargantext.py search nb_replies language
# e.g.:  python3 IsidoreAPIToGargantext.py "brain muscle" 100 fra
try:
    search = sys.argv[1]
    replies = sys.argv[2]
    language = sys.argv[3]
except IndexError:
    print("! args error\n")
    sys.exit(0)

if int(replies) > 1000:
    print("The number of replies must be less than 1000")
    sys.exit(0)

url = ('https://api.isidore.science/resource/search?q=' + search +
       '&output=json&replies=' + replies +
       '&language=http://lexvo.org/id/iso639-3/' + language)
resp = req.get(url)
jsontxt = json.loads(resp.content)
docs = jsontxt["response"]["replies"]["content"]["reply"]

# Output file
output = open("output.csv", "w")
header = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
output.write(header)

for doc in docs:
    # Title: may be a plain string, a dict, or a list of per-language dicts.
    title = doc["isidore"]["title"]
    if type(title) != str:
        if type(title) == list:
            tmp = ''
            for lang in title:
                if type(lang) != str and lang['@xml:lang'] == language[:2]:
                    tmp = lang['$']
            if tmp == '':
                if type(title[0]) == str:
                    title = title[0]
                else:
                    title = title[0]['$']
            else:
                title = tmp
        else:
            title = title['$']
    # Source
    source = doc["isidore"]["source_info"]["sourceName"]["$"]
    # Authors
    if doc['isidore']['enrichedCreators'] != []:
        list_author = doc["isidore"]["enrichedCreators"]["creator"]
        authors = []
        if type(list_author) == list:
            for author in list_author:
                authors.append(author["@origin"])
            authors = ';'.join(authors)
        else:
            authors = list_author["@origin"]
    else:
        authors = ''
    # Abstract: same possible shapes as the title.
    if 'abstract' in doc['isidore'].keys() and doc["isidore"]["abstract"] != []:
        abstract = doc["isidore"]["abstract"]
    else:
        abstract = ''
    if type(abstract) != str:
        if type(abstract) == list:
            tmp = ''
            for lang in abstract:
                if type(lang) != str and lang['@xml:lang'] == language[:2]:
                    tmp = lang['$']
            if tmp == '':
                if type(abstract[0]) == str:
                    abstract = abstract[0]
                else:
                    abstract = abstract[0]['$']
            else:
                abstract = tmp
        else:
            abstract = abstract['$']
    # Publication date
    try:
        pdate = '\t'.join(doc["isidore"]["date"]["normalizedDate"].split('-'))
    except Exception:
        pdate = str(date.today().year) + '\t01\t01'
    abstract = abstract.encode('UTF-8', errors='ignore').decode("utf-8").replace('\t', '').replace('"', '')
    title = title.encode('UTF-8', errors='ignore').decode("utf-8").replace('\t', '').replace('"', '')
    source = source.encode('UTF-8', errors='ignore').decode("utf-8").replace('\t', '').replace('"', '').replace('\n', '')
    # Output row
    row = str(title) + "\t" + source + "\t" + pdate + "\t" + abstract + "\t" + authors + "\t" + str(1) + "\n"
    output.write(row)

output.close()
# IsidoreAPIToGargantext
## About The Project
IsidoreAPIToGargantext calls the Isidore API to run a search with the given parameters and creates a TSV file usable in GarganText.
## Usage
```shell
python3 IsidoreAPIToGargantext.py search replies lang
```
`search` is what you want to search for in Isidore.
`replies` is the number of replies taken from Isidore's answer (at most 1000).
`lang` is the language (see note).
Outputs a TSV-formatted corpus named output.csv.
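For example, mirroring the sample invocation in the script's header comment:
```shell
python3 IsidoreAPIToGargantext.py "brain muscle" 100 fra
```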
## Date
This script was last updated on 2023/07/24.
It may be outdated in the future.
## Note
| language | lang | works? |
| :--- | :--- | :--- |
| French | fra | fine |
| English | eng | fine |
| German | deu | fine |
| Spanish | spa | fine |
| Italian | ita | fine |
| Portuguese | por | fine |
| Dutch | nld | few answers |
| Russian | rus | few answers |
| Chinese | lzh | should work but currently doesn't |
@@ -79,3 +79,7 @@ is_section = true
 path = "pages/Merge_Term_GarganText.py"
 name = "Merge GarganText Terms"
+
+[[pages]]
+path = "pages/GEXF_To_TermOcc.py"
+name = "GEXF To Term"
locale,key,value
fr,title,"# Term / Occurrence"
en,title,"# Json To TSV"
fr,text,"Transforme un fichier GEXF venant du graphe de GarganText en un fichier TSV de terme et d'occurrence."
en,text,"Transform a GEXF file of a graph from GarganText to a TSV file of term and occurrence."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Téléchargez votre fichier TSV :"
en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
@@ -2,7 +2,7 @@ locale,key,value
 fr,title,"# Isidore vers GarganText"
 en,title,"# Isidore To GarganText"
-fr,text,"Effectue une recherche Isidore de documents scientifiques et les convertit en un fichier TSV."
+fr,text,"Effectue une recherche Isidore de documents scientifiques et les convertir en un fichier TSV."
 en,text,"Do a Isidore scientific documents research and convert it into a TSV file."
 fr,keyword,"Mots clés"
@@ -23,10 +23,10 @@ en,overload'api,"The API is overloaded, please retry the request in a few second
 fr,nb_doc,"Nombre de documents : "
 en,nb_doc,"Number of documents : "
-fr,perform1,"Pour des raisons de performence, on limite à "
+fr,perform1,"Pour des raisons de performances, on limite à "
-fr,perform2," le nombre maximum de documents."
+fr,perform2," le nombre de documents maximums"
 en,perform1,"For performance reasons, we limit to "
-en,perform2," ,the maximum number of documents."
+en,perform2," the maximum number of documents"
 fr,nb_taken,"Nombre de documents à prendre"
 en,nb_taken,"Number of documents to take into account"
@@ -35,6 +35,6 @@ fr,createTSV,"Création du fichier TSV (cela peut prendre quelques minutes)"
 en,createTSV,"Creation of the TSV file (it may take a while)"
 fr,doc_abstract1,"Il y a "
-fr,doc_abstract2," documents qui peuvent ne pas avoir de description."
+fr,doc_abstract2," documents qui peuvent ne pas avoir de descriptions."
 en,doc_abstract1,"There are "
 en,doc_abstract2," documents who may not have an abstract"
@@ -11,7 +11,7 @@ en,file,"Choose a file"
 fr,dup1,"Certains fichiers ("
 fr,dup2,") ont été retirés pour diverses raisons (fichier au mauvais format, fichiers identiques au regard des données utilisées par GarganText...)"
 en,dup1,"Some file ("
-en,dup2,") have been removed for various reasons (file with wrong format, file already present...)"
+en,dup2,") have been removed for various reasons (especially identical files, unusable format...)"
 fr,new_file,"Télécharger le fichier TSV :"
 en,new_file,"Download the TSV file:"
@@ -11,5 +11,5 @@ en,file,"Choose a file"
 fr,new_file,"Télécharger le fichier TSV :"
 en,new_file,"Download le TSV file:"
-fr,error,"Erreur : le fichier n'est pas valide"
+fr,error,"Erreur : le fichier n'est pas valide !"
-en,error,"Error : the file isn't valid"
+en,error,"Error : the file isn't valid !"
@@ -35,5 +35,5 @@ en,loading,"Videos processing : "
 fr,quantity," sur "
 en,quantity," out of "
-fr,new_file,"Télécharge ton fichier TSV :"
+fr,new_file,"Téléchargez votre fichier TSV :"
 en,new_file,"Download your TSV file :"
@@ -14,8 +14,8 @@ en,submit,"Submit"
 fr,denied,"L'accès au compte n'est pas public, pour le rendre public: https://www.zotero.org/settings/privacy"
 en,denied,"Account access is not public, to make it public: https://www.zotero.org/settings/privacy"
-fr,add_doc,"*Ajouter les documents que vous voulez mettre dans le TSV*"
+fr,add_doc,"*Ajoutez les documents que vous voulez mettre dans le TSV*"
-en,add_doc,"*Add the document that tou want in the TSV*"
+en,add_doc,"*Add the document that you want in the TSV*"
 fr,select_all,"Select All"
 en,select_all,"Select All"
@@ -39,54 +39,29 @@ def getSeparator(file):
     return '\t', False

-def checkPublicationCase(tmp, split, success):
-    if split:
-        if tmp[0][0].isupper() or tmp[1][0].isupper():
-            return False
-        else:
-            return success
-    if not tmp[0][0].isupper() or not tmp[1][0].isupper():
-        return False
-    return success
-
-def checkPublication(name, registeredNames, errorMessage):
-    tmpName = name
+def lowerName(name):
+    tmp = name
     if re.search('[a-zA-Z0-9]', name[0]) == None:
-        tmpName = name[1:]
-    tmp = tmpName.split(' ')
-    success = True
+        tmp = name[1:]
+    if len(tmp) < 9:
+        return tmp.lower()
+    tmp = name.split(' ')
     split = False
     first = ""
     second = ""
-    if "_" in tmp[0] and len(tmp) == 1:
+    if len(tmp) == 1 and "_" in tmp[0]:
         tmp = tmp[0].split('_')
         split = True
     if len(tmp) != 2:
-        success = False
+        return name.lower()
     else:
-        success = checkPublicationCase(tmp, split, success)
         first = tmp[0][0].lower() + tmp[0][1:]
         second = tmp[1][0].lower() + tmp[1][1:]
-    if first != "publication" or second not in ["day", "month", "year"]:
-        success = False
-    if not success:
-        errorMessage += "Error at line 1 ! Wrong name : " + \
-            name + " is not appropriated ! \n"
-    else:
-        registeredNames.append(first + "_" + second)
-    return success, errorMessage
+        return first + "_" + second

 def checkNameValidity(name, columnNames, registeredNames, errorMessage):
-    tmpName = name
-    if re.search('[a-zA-Z0-9]', name[0]) == None:
-        tmpName = name[1:]
-    if tmpName not in columnNames:
-        errorMessage += "Error at line 1 ! Wrong name : " + \
-            name + " is not appropriated ! \n"
-        return False, errorMessage
-    if tmpName in registeredNames:
+    if name in registeredNames:
         errorMessage += "Error at line 1 ! Same name for 2 differents columns! \n"
         return False, errorMessage
     return True, errorMessage
@@ -105,23 +80,30 @@ def checkColumnExistence(registeredNames, errorMessage):
     return True, errorMessage

-def checkColumnNames(name, errorMessage, registeredNames, success):
-    columnNames = ["authors", "title", "publication_year",
-                   "publication_month", "publication_day", "abstract", "source"]
+def checkColumnNames(name, errorMessage, registeredNames, otherColumns, success):
+    columnNames = ["authors", "title", "source", "publication_year",
+                   "publication_month", "publication_day", "abstract"]
     name = name.replace("\n", "")
-    if len(name) > 9:
-        tmpSuccess, errorMessage = checkPublication(
-            name, registeredNames, errorMessage)
-    else:
-        name = name.replace(" ", "")
-        tmpSuccess, errorMessage = checkNameValidity(
-            name[0].lower() + name[1:], columnNames, registeredNames, errorMessage)
-        if tmpSuccess:
-            registeredNames.append(name[0].lower() + name[1:])
+    tmpSuccess, errorMessage = checkNameValidity(name, columnNames, registeredNames, errorMessage)
+    if tmpSuccess:
+        if lowerName(name) in columnNames:
+            registeredNames.append(name)
+        else:
+            otherColumns.append(name)
     if success:
         success = tmpSuccess
-    return success, errorMessage, registeredNames
+    return errorMessage, registeredNames, otherColumns, success
+
+def addColumnsNamestoTSV(data, registeredNames, otherColumns):
+    for name in registeredNames:
+        if data != "":
+            data += "\t"
+        data += name
+    for name in otherColumns:
+        data += "\t"
+        data += name
+    return data

 def getColumnsNames(file, separator, errorMessage):
     data = ""
@@ -130,40 +112,21 @@ def getColumnsNames(file, separator, errorMessage):
     success = True
     reader = csv.DictReader(codecs.iterdecode(
         file, 'utf-8'), delimiter=separator)
-    columnsNames = []
+    othersColumns = []
     for row in reader:
         for name, value in row.items():
             columnName = name.replace("\ufeff", "")
-            if (columnNb < 7):
-                success, errorMessage, registeredNames = checkColumnNames(
-                    name, errorMessage, registeredNames, success)
-            if data != "":
-                data += "\t"
-            data += columnName
-            columnNb += 1
+            errorMessage, registeredNames, otherColumns, success = checkColumnNames(
                name, errorMessage, registeredNames, othersColumns, success)
         success, errorMessage = checkColumnExistence(
             registeredNames, errorMessage)
+        if success:
+            data = addColumnsNamestoTSV(data, registeredNames, otherColumns)
         break
     data += "\n"
     return data, success, errorMessage

-def lowerName(name):
-    tmp = name.split(' ')
-    split = False
-    first = ""
-    second = ""
-    if len(tmp) == 1 and "_" in tmp[0]:
-        tmp = tmp[0].split('_')
-        split = True
-    if len(tmp) != 2:
-        return name.lower()
-    else:
-        first = tmp[0][0].lower() + tmp[0][1:]
-        second = tmp[1][0].lower() + tmp[1][1:]
-        return first + "_" + second

 def checkDate(name, value, success, fill, csvLine, errorMessage):
     if name in ["publication_year", "publication_month", "publication_day"]:
         if value == "" or value == "\n":
@@ -210,43 +173,45 @@ def correctedSequence(text):
         tmp = "\"" + tmp + "\""
     return tmp

 def getContent(file, separator, data, success, fill, errorMessage):
-    reader = csv.DictReader(codecs.iterdecode(
-        file, 'utf-8'), delimiter=separator)
+    columnNames = ["authors", "title", "source", "publication_year",
+                   "publication_month", "publication_day", "abstract"]
     csvLine = 2
-    columnNb = 0
+    reader = csv.DictReader(codecs.iterdecode(file, 'utf-8'), delimiter=separator)
     for row in reader:
-        tmp = ""
         first = True
+        tsv1 = ""
+        tsv2 = ""
         for name, value in row.items():
             tmpFill = ""
-            if not first:
-                tmp += "\t"
-            else:
-                first = False
-            if (columnNb < 7):
+            if lowerName(name) in columnNames:
+                if not first:
+                    tsv1 += "\t"
                 success, tmpFill, errorMessage = checkMissing(
                     lowerName(name), value, success, fill, csvLine, errorMessage)
                 if tmpFill != "":
-                    tmp += tmpFill
+                    tsv1 += tmpFill
                 else:
                     success, tmpFill, errorMessage = checkDate(
                         lowerName(name), value, success, fill, csvLine, errorMessage)
-                    tmp += correctedSequence(value)
+                    tsv1 += correctedSequence(value)
             else:
-                tmp += correctedSequence(value)
-            columnNb += 1
-        columnNb = 0
+                success, tmpFill, errorMessage = checkMissing(
+                    lowerName(name), value, success, fill, csvLine, errorMessage)
+                if tmpFill != "":
+                    tsv2 += "\t" + tmpFill
+                else:
+                    tsv2 += "\t" + correctedSequence(value)
+            if first:
+                first = False
         csvLine += 1
-        data += tmp + "\n"
+        data += tsv1 + tsv2 + "\n"
     return data[:-1], success, errorMessage

 # Code End
 st.write(st.session_state.general_text_dict['text'])
-st.session_state.fill = st.checkbox(st.session_state.general_text_dict['fill'])
+st.session_state.fill = st.checkbox(value = True, label = st.session_state.general_text_dict['fill'])
 file = st.file_uploader(
     st.session_state.general_text_dict['file'], type=["tsv", "csv"], key='file')
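To make the new helper concrete, here is lowerName pulled out of the diff as a standalone sketch (logic copied, lightly condensed), with assumed sample headers:

```python
import re

def lowerName(name):
    # Strip a leading non-alphanumeric character (e.g. a BOM remnant).
    tmp = name
    if re.search('[a-zA-Z0-9]', name[0]) == None:
        tmp = name[1:]
    # Short names ("Title", "Authors"...) are simply lowercased.
    if len(tmp) < 9:
        return tmp.lower()
    # Longer names like "Publication Year" / "publication_year" are split
    # and rejoined with an underscore, lowercasing each part's first letter.
    tmp = name.split(' ')
    if len(tmp) == 1 and "_" in tmp[0]:
        tmp = tmp[0].split('_')
    if len(tmp) != 2:
        return name.lower()
    return tmp[0][0].lower() + tmp[0][1:] + "_" + tmp[1][0].lower() + tmp[1][1:]

print(lowerName("Publication Year"))  # -> publication_year
print(lowerName("Title"))             # -> title
```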
import streamlit as st
import networkx as nx
import src.basic as tmp

tmp.base('GEXFToTermOcc')


def create_file(file):
    # GarganText exports GEXF 1.3; rewrite the headers so networkx
    # can parse the file as GEXF 1.2draft.
    tmp = file.getvalue().decode('utf-8') \
        .replace('version="1.3"', 'version="1.2"') \
        .replace('xmlns="http://www.gexf.net/1.3"', 'xmlns="http://www.gexf.net/1.2draft"') \
        .replace('xmlns:viz="http://gexf.net/1.3/viz"', 'xmlns:viz="http://www.gexf.net/1.2draft/viz"') \
        .replace('xsi:schemaLocation="http://gexf.net/1.3 http://gexf.net/1.3/gexf.xsd"', 'xsi:schemaLocation="http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd"') \
        .encode()
    file.seek(0, 0)
    file.write(tmp)
    file.seek(0, 0)
    tmp = nx.read_gexf(file, version='1.2draft')
    # Collect (label, size) for every node, largest first.
    lst = []
    for elem in tmp.nodes(True):
        lst.append((elem[1]['label'], elem[1]['viz']['size']))
    lst.sort(key=lambda x: x[1], reverse=True)
    res = 'mapTerm\tocc\n'
    for elem in lst:
        res += elem[0] + '\t' + str(int(elem[1])) + '\n'
    return res


st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader(st.session_state.general_text_dict['file'], type=["gexf"], key='file')
if file:
    try:
        st.write(st.session_state.general_text_dict['new_file'])
        st.download_button('Download TSV', create_file(file), 'output.csv')
    except Exception:
        st.write(st.session_state.general_text_dict['error'])
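As a quick check of the extraction logic, a hand-built toy graph (assumed labels and sizes) run through the same (label, size) collection the page performs:

```python
import networkx as nx

# Toy graph standing in for a GarganText GEXF export (assumed data).
G = nx.Graph()
G.add_node(0, label='term a', viz={'size': 3.0})
G.add_node(1, label='term b', viz={'size': 7.0})

lst = [(d['label'], d['viz']['size']) for _, d in G.nodes(True)]
lst.sort(key=lambda x: x[1], reverse=True)
print(lst)  # [('term b', 7.0), ('term a', 3.0)]
```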
@@ -171,10 +171,10 @@ if st.session_state.stage_isidore > 0:
         form2.write(st.session_state.general_text_dict['perform1'] + str(
             limitItems) + st.session_state.general_text_dict['perform2'])
         st.session_state.nb_wanted = form2.slider(
-            st.session_state.general_text_dict['nb_taken'], 1, limitItems)
+            st.session_state.general_text_dict['nb_taken'], 10, limitItems, 10, 10)
     else:
         st.session_state.nb_wanted = form2.slider(
-            st.session_state.general_text_dict['nb_taken'], 1, int(st.session_state.nb_doc))
+            st.session_state.general_text_dict['nb_taken'], 10, int(st.session_state.nb_doc), 10, 10)
     form2.form_submit_button(
         st.session_state.general_text_dict['submit'], on_click=set_stage, args=(2,))
@@ -19,7 +19,7 @@ numberReplies = 500 # Dont' exceed 1 000
 limitItems = 5000 # Can't be superior of 10 times numberReplies
 retryTime = 2

+# Connect to the Isidore API to get the number of docs for the search
 def loadApiIsidoreNumberFile(search, language):
     while (True):
         url = 'https://api.isidore.science/resource/search?q=' + search + \
@@ -39,7 +39,7 @@ def loadApiIsidoreNumberFile(search, language):
     return docs

+# Connect to the Isidore API to get the documents from the pages
 def loadApiIsidorePage(search, language, page):
     url = 'https://api.isidore.science/resource/search?q=' + search + '&output=json&replies=' + \
         str(numberReplies) + '&page=' + str(page) + \
@@ -58,6 +58,7 @@ def loadApiIsidorePage(search, language, page):
 def create_output(search, language, nb_doc):
     output = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
     nb = 0
+    # nb is used to return the number of files with
     for i in range(1, nb_doc//numberReplies + 1):
         while (True):
             txt = loadApiIsidorePage(search, language, i)
@@ -68,6 +69,8 @@ def create_output(search, language, nb_doc):
             tmp, nb_tmp = createFile(txt, numberReplies, language)
             output += tmp
             nb += nb_tmp
+
+    # If there are still documents to fetch (for example with 1160 documents, 160 remain after the full pages)
 if nb_doc % numberReplies != 0:
     while (True):
         txt = loadApiIsidorePage(search, language, nb_doc//numberReplies + 1)
@@ -151,7 +154,6 @@ def createFile(docs, limit, language):
             abstract = ''

         if 'types' in doc['isidore'].keys():
-            print(i)
             if type(doc['isidore']['types']['type']) == str and doc['isidore']['types']['type'] in ['Books', 'text']:
                 nb += 1
             elif type(doc['isidore']['types']['type']) == dict and doc['isidore']['types']['type']['$'] in ['Books', 'text']:
@@ -249,7 +251,7 @@ form.form_submit_button(

 # API and Slider
 if st.session_state.stage_isidore > 0:
-    # Only call first time and after
+    # Only call first time and after an update in the first form
     if 'search' not in st.session_state or 'language' not in st.session_state or search != st.session_state.search or language != st.session_state.language:
         with st.spinner(st.session_state.general_text_dict['load_api']):
             nb_doc = int(loadApiIsidoreNumberFile(search, lang[language]))
@@ -269,10 +271,10 @@ if st.session_state.stage_isidore > 0:
         form2.write(st.session_state.general_text_dict['perform1'] + str(
             limitItems) + st.session_state.general_text_dict['perform2'])
         st.session_state.nb_wanted = form2.slider(
-            st.session_state.general_text_dict['nb_taken'], 1, limitItems)
+            st.session_state.general_text_dict['nb_taken'], 10, limitItems, 10, 10)
     else:
         st.session_state.nb_wanted = form2.slider(
-            st.session_state.general_text_dict['nb_taken'], 1, int(st.session_state.nb_doc))
+            st.session_state.general_text_dict['nb_taken'], 10, int(st.session_state.nb_doc), 10, 10)
     form2.form_submit_button(
         st.session_state.general_text_dict['submit'], on_click=set_stage, args=(2,))
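The pagination arithmetic in create_output can be sanity-checked with the diff's own example numbers:

```python
numberReplies = 500   # page size used by the script
nb_doc = 1160         # example from the added comment

full_pages = nb_doc // numberReplies   # 2 full pages of 500 documents
remainder = nb_doc % numberReplies     # 160 documents left for one extra page
print(full_pages, remainder)           # 2 160
```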
@@ -5,7 +5,7 @@ Loïc Chapron
 import json
 import pandas as pd
-import datetime
+from datetime import datetime
 import zipfile
 import streamlit as st
 import src.basic as tmp
@@ -60,8 +60,6 @@ def read_zip(zip_file):
             temp["publication_year"] = article["publicationDate"][0]
         except:
             temp["publication_year"] = datetime.date.today().year
-        temp["publication_year"] = article.get(
-            "publicationDate", datetime.date.today().year)[0]
         temp["publication_month"] = 1
         temp["publication_day"] = 1
@@ -78,7 +76,7 @@ def read_zip(zip_file):
     if (duplicated.any()):
         dup += duplicated.sum()
-    output.drop(['code'], axis=1)
+    output = output.drop(['code'], axis=1)
     output = output[~duplicated]
     df = pd.DataFrame(output)
     return df.to_csv(index=False, sep='\t'), dup
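One thing worth checking in this change: after `from datetime import datetime`, the name `datetime` is the class, so `datetime.date` no longer refers to the `date` class and the fallback in the except branch would fail. A minimal sketch of the distinction:

```python
import datetime
print(datetime.date.today().year)   # works: datetime.date is the date class

from datetime import datetime
print(datetime.today().year)        # works: classmethod on the datetime class
# datetime.date.today()             # AttributeError: datetime.date is now a method
```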
@@ -13,6 +13,8 @@ import re
 import chardet
 import pandas as pd
 import streamlit as st
+import lib.tika.tika as tika
+tika.initVM()
 from lib.tika.tika import parser
 from lib.langdetect.langdetect import detect
 from lib.langdetect.langdetect.lang_detect_exception import LangDetectException
@@ -136,7 +138,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
     count = 1
     languages = {}
     while n < nbLines - 2:
-        doc = "\n".join(abstract[n: n + 9]).replace("�", "")
+        doc = "\n".join(abstract[n: n + 9]).replace("�", "").replace("", "")
         title = source + " : Part " + str(count)
         tsv += correctedSequence(author, False) + "\t" + correctedSequence(
             source, False) + "\t" + year + "\t" + month + "\t" + day + "\t"