Commit 2f362ef1 authored by Anne-Laure Thomas Derepas's avatar Anne-Laure Thomas Derepas

Merge branch 'dev' into 'master'

Dev

See merge request !10
parents f93762e6 92a7e0f5
[[pages]] [[pages]]
path = "Welcome.py" path = "Homepage.py"
name = "Home" name = "Homepage"
icon = ":house:" icon = ":house:"
[[pages]] [[pages]]
......
...@@ -4,13 +4,13 @@ Loïc Chapron ...@@ -4,13 +4,13 @@ Loïc Chapron
""" """
import streamlit as st import streamlit as st
import src.basic as tmp import pandas as pd
import src.basic as tmp
tmp.base("Welcome") tmp.base("Homepage")
st.write(st.session_state.general_text_dict['welcome']) st.write(st.session_state.general_text_dict['welcome'])
st.write(st.session_state.general_text_dict['tools']) st.write(st.session_state.general_text_dict['tools'])
st.write(st.session_state.general_text_dict['code']) st.write(st.session_state.general_text_dict['code'])
st.write(st.session_state.general_text_dict['help']) st.write(st.session_state.general_text_dict['help'])
...@@ -15,7 +15,7 @@ pip install youtube-transcript-api ...@@ -15,7 +15,7 @@ pip install youtube-transcript-api
## Start Project ## Start Project
```shell ```shell
streamlit run welcome.py streamlit run Homepage.py
``` ```
## About YTB to TSV tool ## About YTB to TSV tool
......
...@@ -31,5 +31,5 @@ en,perform2," the maximum number of documents" ...@@ -31,5 +31,5 @@ en,perform2," the maximum number of documents"
fr,nb_taken,"Nombres de documents à prendre" fr,nb_taken,"Nombres de documents à prendre"
en,nb_taken,"Number of documents to take" en,nb_taken,"Number of documents to take"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelque minutes)" fr,createTSV,"Création du fichier TSV (Cela peut prendre quelques minutes)"
en,createTSV,"Creation of the TSV file (It may take a while)" en,createTSV,"Creation of the TSV file (It may take a while)"
...@@ -2,7 +2,7 @@ locale,key,value ...@@ -2,7 +2,7 @@ locale,key,value
fr,title,"# Isidore vers GarganText" fr,title,"# Isidore vers GarganText"
en,title,"# Isidore To GarganText" en,title,"# Isidore To GarganText"
fr,text,"Effectue une recherche Isidore de documents scientifiques et les convertir en un fichier TSV." fr,text,"Effectue une recherche Isidore de documents scientifiques et les convertit en un fichier TSV."
en,text,"Do a Isidore scientific documents research and convert it into a TSV file." en,text,"Do a Isidore scientific documents research and convert it into a TSV file."
fr,keyword,"Mots clés" fr,keyword,"Mots clés"
...@@ -17,21 +17,21 @@ en,submit,"Submit" ...@@ -17,21 +17,21 @@ en,submit,"Submit"
fr,load_api,"Chargement de l'api..." fr,load_api,"Chargement de l'api..."
en,load_api,"Loading API..." en,load_api,"Loading API..."
fr,overload_api,"L'API est surchargé, relancer la requête dans quelques secondes" fr,overload_api,"L'API est surchargé, relancer la requête dans quelques secondes."
en,overload'api,"The API is overloaded, please retry the request in a few seconds" en,overload_api,"The API is overloaded, please retry the request in a few seconds."
fr,nb_doc,"Nombres de documents : " fr,nb_doc,"Nombres de documents : "
en,nb_doc,"Numbers of documents : " en,nb_doc,"Numbers of documents : "
fr,perform1,"Pour des raisons de performence, on limit à " fr,perform1,"Pour des raisons de performance, on limite à "
fr,perform2," le nombre de document maximum" fr,perform2," le nombre maximum de documents."
en,perform1,"For performance reasons, we limit to " en,perform1,"For performance reasons, we limit to "
en,perform2," the maximum number of documents" en,perform2," the maximum number of documents."
fr,nb_taken,"Nombres de documents à prendre" fr,nb_taken,"Nombres de documents à prendre"
en,nb_taken,"Number of documents to take" en,nb_taken,"Number of documents to take"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelque minutes)" fr,createTSV,"Création du fichier TSV (Cela peut prendre quelques minutes)"
en,createTSV,"Creation of the TSV file (It may take a while)" en,createTSV,"Creation of the TSV file (It may take a while)"
fr,doc_abstract1,"Il y a " fr,doc_abstract1,"Il y a "
......
...@@ -29,8 +29,8 @@ en,watermark,"Watermark : " ...@@ -29,8 +29,8 @@ en,watermark,"Watermark : "
fr,submit," Soumettre " fr,submit," Soumettre "
en,submit,"Submit " en,submit,"Submit "
fr,loading," Conversion du pdf en cours " fr,loading," Conversion du PDF en cours "
en,loading," Processing pdf conversion " en,loading," Processing PDF conversion "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : " fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning2,"Les langues suivantes ont été détectées : " fr,warning2,"Les langues suivantes ont été détectées : "
......
...@@ -5,11 +5,11 @@ en,title,"# TXT To TSV" ...@@ -5,11 +5,11 @@ en,title,"# TXT To TSV"
fr,text,"Convertit un fichier TXT en un fichier TSV compatible avec Gargantext" fr,text,"Convertit un fichier TXT en un fichier TSV compatible avec Gargantext"
en,text,"Convert a TXT file into a TSV file compatible with GarganText" en,text,"Convert a TXT file into a TSV file compatible with GarganText"
fr,text2,"Cet outil détecte automatiquement les langues présentes au sein des PDF à l'aide de l'API Google Translate." fr,text2,"Convertit un ZIP de fichiers TXT en fichiers TSV compatibles avec Gargantext"
en,text2,"This tool detect automatically the languages of the PDF with the Google Translate API." en,text2,"Convert a ZIP of TXT files into TSV files compatible with GarganText"
fr,text3,"Vous pouvez choisir le titre et le(s) auteur(s) et indiquer, s'il existe, le filigrane de ce PDF." fr,text3,"Vous pouvez choisir le titre et le(s) auteur(s) de ce TXT."
en,text3,"You can choose the title and the author(s) and specify, if it does exist, the watermark for this PDF." en,text3,"You can choose the title and the author(s) for this TXT."
fr,file,"Choisir un fichier" fr,file,"Choisir un fichier"
en,file,"Choose a file" en,file,"Choose a file"
...@@ -20,12 +20,15 @@ en,new_file,"Download your TSV file : " ...@@ -20,12 +20,15 @@ en,new_file,"Download your TSV file : "
fr,author,"Auteur(s) : " fr,author,"Auteur(s) : "
en,author,"Author(s) : " en,author,"Author(s) : "
fr,titlePDF,"Titre : " fr,titleTXT,"Titre : "
en,titlePDF,"Title : " en,titleTXT,"Title : "
fr,submit," Soumettre " fr,submit," Soumettre "
en,submit,"Submit " en,submit,"Submit "
fr,loading," Conversion du TXT en cours "
en,loading," Processing TXT conversion "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : " fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning2,"Les langues suivantes ont été détectées : " fr,warning2,"Les langues suivantes ont été détectées : "
en,warning,"Warning ! Multiple languages have been detected at the source : " en,warning,"Warning ! Multiple languages have been detected at the source : "
......
...@@ -141,4 +141,4 @@ if st.session_state.page == 1: ...@@ -141,4 +141,4 @@ if st.session_state.page == 1:
tsv = HarzingToTsv(separator) tsv = HarzingToTsv(separator)
name = st.session_state.file.name.split('.')[0] + '.tsv' name = st.session_state.file.name.split('.')[0] + '.tsv'
st.write(st.session_state.general_text_dict['new_file']) st.write(st.session_state.general_text_dict['new_file'])
st.download_button(name, tsv, name, on_click=resetPage()) st.download_button(name, tsv, name, on_click=resetPage)
...@@ -190,5 +190,7 @@ if st.session_state.stage_isidore > 1: ...@@ -190,5 +190,7 @@ if st.session_state.stage_isidore > 1:
print(st.session_state.nb_wanted) print(st.session_state.nb_wanted)
st.session_state.output = create_output( st.session_state.output = create_output(
st.session_state.search, lang[st.session_state.language], st.session_state.nb_wanted) st.session_state.search, lang[st.session_state.language], st.session_state.nb_wanted)
st.download_button('Download TSV', st.session_state.output, 'output.csv')
fileName = "HALOutput_" + str(datetime.now().strftime("%Y-%m-%d_%H:%M:%S")) + '.csv'
st.download_button('Download TSV', st.session_state.output, fileName)
...@@ -7,6 +7,7 @@ import streamlit as st ...@@ -7,6 +7,7 @@ import streamlit as st
import requests as req import requests as req
import json import json
import time import time
from datetime import datetime
from json import JSONDecodeError from json import JSONDecodeError
import src.basic as tmp import src.basic as tmp
...@@ -64,11 +65,16 @@ def create_output(search, language, nb_doc): ...@@ -64,11 +65,16 @@ def create_output(search, language, nb_doc):
break break
time.sleep(retryTime) time.sleep(retryTime)
print('Retry') print('Retry')
tmp, nb_tmp = createFile(txt, nb_doc % numberReplies, language) tmp, nb_tmp = createFile(txt, numberReplies, language)
output += tmp output += tmp
nb += nb_tmp nb += nb_tmp
if nb_doc % numberReplies != 0: if nb_doc % numberReplies != 0:
txt = loadApiIsidorePage(search, language, nb_doc//numberReplies + 1) while (True):
txt = loadApiIsidorePage(search, language, nb_doc//numberReplies + 1)
if txt != 0:
break
time.sleep(retryTime)
print('Retry')
tmp, nb_tmp = createFile(txt, nb_doc % numberReplies, language) tmp, nb_tmp = createFile(txt, nb_doc % numberReplies, language)
output += tmp output += tmp
nb += nb_tmp nb += nb_tmp
...@@ -139,12 +145,16 @@ def createFile(docs, limit, language): ...@@ -139,12 +145,16 @@ def createFile(docs, limit, language):
else: else:
abstract = tmp abstract = tmp
else: else:
abstract = abstract['$'] if '$' in abstract.keys():
abstract = abstract['$']
else:
abstract = ''
if 'types' in doc['isidore'].keys(): if 'types' in doc['isidore'].keys():
if type(doc['isidore']['types']['type'] == str) and doc['isidore']['types']['type'] in ['Books', 'text']: print(i)
if type(doc['isidore']['types']['type']) == str and doc['isidore']['types']['type'] in ['Books', 'text']:
nb += 1 nb += 1
elif type(doc['isidore']['types']['type'] == dict) and doc['isidore']['types']['type'][1] in ['Books', 'text']: elif type(doc['isidore']['types']['type']) == dict and doc['isidore']['types']['type']['$'] in ['Books', 'text']:
nb += 1 nb += 1
else: else:
print(title) print(title)
...@@ -280,4 +290,5 @@ if st.session_state.stage_isidore > 1: ...@@ -280,4 +290,5 @@ if st.session_state.stage_isidore > 1:
st.write(st.session_state.general_text_dict['doc_abstract1'] + str( st.write(st.session_state.general_text_dict['doc_abstract1'] + str(
st.session_state.nb_bad_file) + st.session_state.general_text_dict['doc_abstract2']) st.session_state.nb_bad_file) + st.session_state.general_text_dict['doc_abstract2'])
st.download_button('Download TSV', st.session_state.output, 'output.csv') fileName = "isidoreOutput_" + str(datetime.now().strftime("%Y-%m-%d_%H:%M:%S")) + '.csv'
st.download_button('Download TSV', st.session_state.output, fileName)
...@@ -91,13 +91,13 @@ file = st.file_uploader( ...@@ -91,13 +91,13 @@ file = st.file_uploader(
if file: if file:
try: try:
name = file.name.split('.')[0] + '.csv' fileName = "istexOutput_" + str(datetime.now().strftime("%Y-%m-%d_%H:%M:%S")) + '.csv'
res, nb_dup = read_zip(file) res, nb_dup = read_zip(file)
if nb_dup: if nb_dup:
st.write(st.session_state.general_text_dict['dup1'] + str( st.write(st.session_state.general_text_dict['dup1'] + str(
nb_dup) + st.session_state.general_text_dict['dup2']) nb_dup) + st.session_state.general_text_dict['dup2'])
st.write(st.session_state.general_text_dict['new_file']) st.write(st.session_state.general_text_dict['new_file'])
st.download_button(name, res, name) st.download_button('Download TSV', res, fileName)
except Exception as e: except Exception as e:
st.write(st.session_state.general_text_dict['error']) st.write(st.session_state.general_text_dict['error'])
print(e) print(e)
......
...@@ -151,7 +151,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day ...@@ -151,7 +151,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
languages = detectLanguages(doc, languages) languages = detectLanguages(doc, languages)
st.session_state.pdfLanguages[fileName] = detectMultipleLanguages( st.session_state.pdfLanguages[fileName] = detectMultipleLanguages(
languages, fileName) languages, fileName)
return tsv, languages return tsv
def correctedSequence(text, last): def correctedSequence(text, last):
...@@ -173,21 +173,21 @@ def getInfo(): ...@@ -173,21 +173,21 @@ def getInfo():
return st.session_state.author, title, st.session_state.watermark return st.session_state.author, title, st.session_state.watermark
def txtToTSV(fileName, fileAddress, pdfDir): def pdfToTSV(fileName, fileAddress, pdfDir):
st.session_state.page = 1 st.session_state.page = 1
author, title, watermark = getInfo() author, title, watermark = getInfo()
tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n" tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
with st.spinner(st.session_state.general_text_dict['loading']): with st.spinner(st.session_state.general_text_dict['loading']):
tsv, languages = segmentAbstract(fileName, fileAddress, tsv, author, title, tsv = segmentAbstract(fileName, fileAddress, tsv, author, title,
str(date.today().year), "1", "1", watermark) str(date.today().year), "1", "1", watermark)
if '/' in fileName: if '/' in fileName:
fileName = fileName.split('/')[1] fileName = fileName.split('/')[1]
with open(pdfDir + "/" + fileName.replace(".pdf", ".tsv"), "w", encoding="utf-8-sig") as file: with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w", encoding="utf-8-sig") as file:
file.write(tsv) file.write(tsv)
tsv = "\n".join(tsv.split("\n")[1:]) tsv = "\n".join(tsv.split("\n")[1:])
return tsv, languages return tsv
def extractAllPDF(zipDir, zipFile): def extractAllPDF(zipDir, zipFile):
...@@ -223,6 +223,10 @@ def setSubmit(): ...@@ -223,6 +223,10 @@ def setSubmit():
st.session_state.submit = True st.session_state.submit = True
def resetPage():
    """Widget callback: set the session's ``page`` value back to 0."""
    state = st.session_state
    state.page = 0
def upPage(): def upPage():
st.session_state.page = 2 st.session_state.page = 2
...@@ -234,7 +238,7 @@ def uploadZip(): ...@@ -234,7 +238,7 @@ def uploadZip():
st.file_uploader( st.file_uploader(
st.session_state.general_text_dict['file'], type=["zip"], key='file') st.session_state.general_text_dict['file'], type=["zip"], key='file')
st.form_submit_button( st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=setSubmit()) st.session_state.general_text_dict['submit'], on_click=setSubmit)
def askPDF(fileName): def askPDF(fileName):
...@@ -254,7 +258,7 @@ def askPDF(fileName): ...@@ -254,7 +258,7 @@ def askPDF(fileName):
st.text_input( st.text_input(
st.session_state.general_text_dict['watermark'], key='watermark') st.session_state.general_text_dict['watermark'], key='watermark')
st.form_submit_button( st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=upPage()) st.session_state.general_text_dict['submit'], on_click=upPage)
# Page Code End # Page Code End
...@@ -279,7 +283,7 @@ if st.session_state.page == 0: ...@@ -279,7 +283,7 @@ if st.session_state.page == 0:
if st.session_state.page == 2: if st.session_state.page == 2:
fileName = st.session_state.fileName fileName = st.session_state.fileName
tmp, languages = txtToTSV( tmp = pdfToTSV(
fileName, st.session_state.zipDir.name + '/' + fileName, st.session_state.pdfDir.name) fileName, st.session_state.zipDir.name + '/' + fileName, st.session_state.pdfDir.name)
st.session_state.tsv += "\n" + tmp st.session_state.tsv += "\n" + tmp
if st.session_state.nbDoc == st.session_state.len - 1: if st.session_state.nbDoc == st.session_state.len - 1:
...@@ -306,4 +310,4 @@ if st.session_state.page == 3: ...@@ -306,4 +310,4 @@ if st.session_state.page == 3:
detectMultiplePdfLanguages() detectMultiplePdfLanguages()
st.write(st.session_state.general_text_dict['new_file']) st.write(st.session_state.general_text_dict['new_file'])
st.download_button("PDFCompilation.zip", st.download_button("PDFCompilation.zip",
zip, "PDFCompilation.zip") zip, "PDFCompilation.zip", on_click=resetPage)
...@@ -97,7 +97,7 @@ def askPDF(): ...@@ -97,7 +97,7 @@ def askPDF():
st.file_uploader( st.file_uploader(
st.session_state.general_text_dict['file'], type=["pdf"], key='file') st.session_state.general_text_dict['file'], type=["pdf"], key='file')
st.form_submit_button( st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=setSubmit()) st.session_state.general_text_dict['submit'], on_click=setSubmit)
# Page Code End # Page Code End
...@@ -124,4 +124,4 @@ if st.session_state.page == 1: ...@@ -124,4 +124,4 @@ if st.session_state.page == 1:
name = st.session_state.file.name.split('.')[0] + '.txt' name = st.session_state.file.name.split('.')[0] + '.txt'
st.write(st.session_state.general_text_dict['new_file']) st.write(st.session_state.general_text_dict['new_file'])
st.session_state.submit = False st.session_state.submit = False
st.download_button(name, txt, name, on_click=setPage()) st.download_button(name, txt, name, on_click=setPage)
...@@ -43,12 +43,13 @@ def estimateLanguagesPercentage(languages): ...@@ -43,12 +43,13 @@ def estimateLanguagesPercentage(languages):
for l in languages: for l in languages:
total += languages[l] total += languages[l]
for l in languages: for l in languages:
tmp = (languages[l] / total) * 100 tmp = round((languages[l] / total) * 100, 1)
if tmp >= 15: if tmp >= 15:
res[l] = tmp res[l] = tmp
if st.session_state.detected != "": if st.session_state.detected != "":
st.session_state.detected += "| " st.session_state.detected += "| "
st.session_state.detected += l + " : " + str(tmp) + "%" st.session_state.detected += l + " : " + str(tmp) + "%"
print(res)
return res return res
...@@ -182,10 +183,11 @@ def uploadTSV(): ...@@ -182,10 +183,11 @@ def uploadTSV():
with st.form("Detect"): with st.form("Detect"):
st.write(st.session_state.general_text_dict['text']) st.write(st.session_state.general_text_dict['text'])
st.write(st.session_state.general_text_dict['text2'])
st.file_uploader( st.file_uploader(
st.session_state.general_text_dict['file'], type=["tsv", "csv"], key='file') st.session_state.general_text_dict['file'], type=["tsv", "csv"], key='file')
st.form_submit_button( st.form_submit_button(
st.session_state.general_text_dict['detect'], on_click=setDetect()) st.session_state.general_text_dict['detect'], on_click=setDetect)
def askTranslateLanguages(file): def askTranslateLanguages(file):
...@@ -202,7 +204,7 @@ def askTranslateLanguages(file): ...@@ -202,7 +204,7 @@ def askTranslateLanguages(file):
st.selectbox(st.session_state.general_text_dict['translate2'], st.session_state.languages, st.selectbox(st.session_state.general_text_dict['translate2'], st.session_state.languages,
key='destLang') key='destLang')
st.form_submit_button( st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=setSubmit()) st.session_state.general_text_dict['submit'], on_click=setSubmit)
# Page Code End # Page Code End
...@@ -239,11 +241,11 @@ if st.session_state.page == 2: ...@@ -239,11 +241,11 @@ if st.session_state.page == 2:
st.write(st.session_state.general_text_dict['new_file']) st.write(st.session_state.general_text_dict['new_file'])
name = st.session_state.tmpFile.name name = st.session_state.tmpFile.name
st.download_button(name, st.download_button(name,
tsv, name, on_click=resetPage()) tsv, name, on_click=resetPage)
if st.session_state.page == 3: if st.session_state.page == 3:
st.write( st.write(
st.session_state.general_text_dict['sameLanguages'] + list(st.session_state.languages.keys())[0]) st.session_state.general_text_dict['sameLanguages'] + list(st.session_state.languages.keys())[0])
st.session_state.languages = {} st.session_state.languages = {}
st.button( st.button(
st.session_state.general_text_dict['anotherFile'], on_click=resetPage()) st.session_state.general_text_dict['anotherFile'], on_click=resetPage)
...@@ -4,13 +4,12 @@ Nicolas Atrax ...@@ -4,13 +4,12 @@ Nicolas Atrax
""" """
import streamlit as st import streamlit as st
import pandas as pd import zipfile
import chardet import tempfile
import shutil
import os
import re import re
from datetime import date from datetime import date
import codecs
import os
import tempfile
from lib.langdetect.langdetect import detect from lib.langdetect.langdetect import detect
from lib.langdetect.langdetect.lang_detect_exception import LangDetectException from lib.langdetect.langdetect.lang_detect_exception import LangDetectException
import src.basic as tmp import src.basic as tmp
...@@ -70,6 +69,18 @@ def detectMultipleLanguages(languages, fileName): ...@@ -70,6 +69,18 @@ def detectMultipleLanguages(languages, fileName):
return principal return principal
def detectMultipleTxtLanguages():
    """Show informational messages when the stored TXT languages disagree.

    Compares every value of ``st.session_state.txtLanguages`` with the first
    one. On the first mismatch it displays the ``globalWarning`` text, the
    string form of the whole mapping and the ``advice`` text, then stops.
    Does nothing when the mapping is empty or all values agree.
    """
    detected = list(st.session_state.txtLanguages.values())
    if not detected:
        return
    # Only the first value is ever used as the reference language.
    reference = [detected[0]]
    for candidate in detected[1:]:
        if candidate not in reference:
            st.info(st.session_state.general_text_dict['globalWarning'])
            st.info(str(st.session_state.txtLanguages))
            st.info(st.session_state.general_text_dict['advice'])
            return
def segmentAbstract(abstract, tsv): def segmentAbstract(abstract, tsv):
year = str(date.today().year) year = str(date.today().year)
month = "1" month = "1"
...@@ -101,6 +112,35 @@ def segmentAbstract(abstract, tsv): ...@@ -101,6 +112,35 @@ def segmentAbstract(abstract, tsv):
return tsv return tsv
def segmentAbstract2(abstract, tsv, author, title, fileName=None):
    """Append one TSV row per window of lines and record the document language.

    Args:
        abstract: sequence of text lines making up the document.
        tsv: TSV text accumulated so far; new rows are appended to it.
        author: value written in the authors column of every row.
        title: document title; written as the source column and used as the
            prefix of each row title (``"<title> : Part <k>"``).
        fileName: key under which the result of ``detectMultipleLanguages``
            is stored in ``st.session_state.txtLanguages``. Defaults to
            ``st.session_state.fileName``. Previously this name was read
            from an implicit module-level global, which made the function
            fail with ``NameError`` unless the calling script happened to
            define it.

    Returns:
        The TSV text with the new rows appended.
    """
    if fileName is None:
        fileName = st.session_state.fileName
    year = str(date.today().year)
    month = "1"
    day = "1"
    source = title
    nbLines = len(abstract)
    n = 0
    count = 1
    languages = {}
    while n < nbLines - 2:
        # Each row holds up to 9 lines; U+FFFD replacement characters are dropped.
        doc = "".join(abstract[n: n + 9]).replace("�", "")
        partTitle = source + " : Part " + str(count)
        tsv += correctedSequence(author, False) + "\t" + correctedSequence(
            source, False) + "\t" + year + "\t" + month + "\t" + day + "\t"
        tsv += correctedSequence(partTitle, False) + "\t"
        tsv += correctedSequence(doc, True)
        if tsv[-1] != "\n":
            tsv += "\n"
        # Windows advance by 7 lines, so consecutive rows overlap by 2 lines.
        n += 7
        count += 1
        # Re-align the last window on the final 9 lines; the next step then
        # lands exactly on nbLines - 2 and ends the loop.
        if n > nbLines - 9 and n != nbLines - 2:
            n = nbLines - 9
        languages = detectLanguages(doc, languages)
    st.session_state.txtLanguages[fileName] = detectMultipleLanguages(
        languages, source)
    return tsv
def correctedSequence(text, last): def correctedSequence(text, last):
tmp = text.replace("\"", "\"\"") tmp = text.replace("\"", "\"\"")
find = "\t" in text or "\"" in text or "\n" in text find = "\t" in text or "\"" in text or "\n" in text
...@@ -115,16 +155,31 @@ def correctedSequence(text, last): ...@@ -115,16 +155,31 @@ def correctedSequence(text, last):
def getTxt(): def getTxt():
txt = [] txt = []
st.session_state.pdfDir = tempfile.TemporaryDirectory() st.session_state.tmpDir = tempfile.TemporaryDirectory()
name = st.session_state.file.name name = st.session_state.file.name
with open(st.session_state.pdfDir.name + "/" + name, "wb") as file: with open(st.session_state.tmpDir.name + "/" + name, "wb") as file:
file.write(st.session_state.file.getvalue()) file.write(st.session_state.file.getvalue())
with open(st.session_state.pdfDir.name + "/" + name, "r") as file: with open(st.session_state.tmpDir.name + "/" + name, "r") as file:
for line in file: for line in file:
txt.append(line) txt.append(line)
return txt return txt
def getTxt2(fileAddress):
    """Return the lines of the text file at ``fileAddress`` as a list.

    Line endings are kept. The file is opened in text mode with the
    platform's default encoding.
    """
    with open(fileAddress, "r") as handle:
        return handle.readlines()
def getInfo():
    """Return ``(author, title)`` read from the Streamlit session state.

    When the stored title is an empty string, the session's ``fileName``
    with ``".txt"`` removed is used as the title instead.
    """
    state = st.session_state
    chosenTitle = state.title
    if chosenTitle == "":
        chosenTitle = state.fileName.replace(".txt", "")
    return state.author, chosenTitle
def txtToTSV(): def txtToTSV():
fileName = st.session_state.file.name fileName = st.session_state.file.name
...@@ -135,6 +190,32 @@ def txtToTSV(): ...@@ -135,6 +190,32 @@ def txtToTSV():
return tsv return tsv
def txtToTSV2(fileName, fileAddress, txtDir):
    """Convert one TXT file to TSV, save it in ``txtDir`` and return its rows.

    Args:
        fileName: name of the TXT file; when it contains ``'/'`` only the
            second path component is used for the output file name.
        fileAddress: path of the TXT file to read.
        txtDir: directory receiving ``<name>(txt).tsv``.

    Returns:
        The generated TSV text without its header line.
    """
    st.session_state.page = 1
    author, title = getInfo()
    lines = getTxt2(fileAddress)
    header = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
    with st.spinner(st.session_state.general_text_dict['loading']):
        tsv = segmentAbstract2(lines, header, author, title)
    if '/' in fileName:
        fileName = fileName.split('/')[1]
    target = txtDir + "/" + fileName.replace(".txt", "(txt).tsv")
    with open(target, "w", encoding="utf-8-sig") as file:
        file.write(tsv)
    # Drop the header line: keep everything after the first newline.
    return tsv.partition("\n")[2]
def extractAllTXT(zipDir, zipFile):
    """Extract every ``.txt`` member of a ZIP archive, flattened, into ``zipDir``.

    Directory components are removed from each member name, so every file
    lands directly in ``zipDir``. Members sharing the same base name
    overwrite each other.

    Only members whose name ends with ``".txt"`` are extracted. The previous
    substring test also extracted names such as ``notes.txt.bak``.

    Args:
        zipDir: destination directory.
        zipFile: path or binary file-like object accepted by
            ``zipfile.ZipFile``.
    """
    with zipfile.ZipFile(zipFile) as zipRef:
        for info in zipRef.infolist():
            # Keep only the part after the last '/'; directory entries
            # (names ending in '/') become '' and are skipped below.
            info.filename = info.filename.rsplit('/', 1)[-1]
            if info.filename.endswith(".txt"):
                zipRef.extract(info, zipDir)
# Tool Code End # Tool Code End
...@@ -148,20 +229,45 @@ if 'page' not in st.session_state: ...@@ -148,20 +229,45 @@ if 'page' not in st.session_state:
if 'warning' not in st.session_state: if 'warning' not in st.session_state:
st.session_state.warning = "" st.session_state.warning = ""
if 'submit' not in st.session_state:
st.session_state.submit = False
if 'zipSubmit' not in st.session_state:
st.session_state.zipSubmit = False
if 'txtLanguages' not in st.session_state:
st.session_state.txtLanguages = {}
def setSubmit(): def setSubmit():
st.session_state.submit = True st.session_state.submit = True
def setPage(): def setZIPSubmit():
st.session_state.zipSubmit = True
def resetPage():
st.session_state.page = 0 st.session_state.page = 0
def upPage():
    """Widget callback: set the session's ``page`` value to 3."""
    state = st.session_state
    state.page = 3
def uploadZip():
    """Render the "Submit2" form used to upload a ZIP archive.

    The form shows the ``text2`` description and a ZIP-only file uploader
    stored under the ``zipFile`` session key. Its submit button calls
    ``setZIPSubmit``.
    """
    with st.form("Submit2"):
        labels = st.session_state.general_text_dict
        st.write(labels['text2'])
        st.file_uploader(labels['file'], type=["zip"], key='zipFile')
        st.form_submit_button(labels['submit'], on_click=setZIPSubmit)
def askTXT(): def askTXT():
with st.form("Submit"): with st.form("Submit"):
st.write(st.session_state.general_text_dict['text']) st.write(st.session_state.general_text_dict['text'])
st.write(st.session_state.general_text_dict['text2'])
st.write(st.session_state.general_text_dict['text3']) st.write(st.session_state.general_text_dict['text3'])
col1, col2 = st.columns(2) col1, col2 = st.columns(2)
st.session_state.author = "" st.session_state.author = ""
...@@ -170,28 +276,58 @@ def askTXT(): ...@@ -170,28 +276,58 @@ def askTXT():
st.session_state.general_text_dict['author'], key='author') st.session_state.general_text_dict['author'], key='author')
with col2: with col2:
st.text_input( st.text_input(
st.session_state.general_text_dict['titlePDF'], key='title') st.session_state.general_text_dict['titleTXT'], key='title')
st.file_uploader( st.file_uploader(
st.session_state.general_text_dict['file'], type=["txt"], key='file') st.session_state.general_text_dict['file'], type=["txt"], key='file')
st.form_submit_button( st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=setSubmit()) st.session_state.general_text_dict['submit'], on_click=setSubmit)
def askTXT2(fileName):
    """Render the "Submit" form asking for the author and title of one TXT file.

    Args:
        fileName: name displayed at the top of the form.

    The author and title inputs are stored under the ``author`` and
    ``title`` session keys. The submit button calls ``upPage``.
    """
    with st.form("Submit"):
        st.write(fileName)
        labels = st.session_state.general_text_dict
        st.write(labels['text3'])
        authorCol, titleCol = st.columns(2)
        # The author value is cleared before its input widget is created.
        st.session_state.author = ""
        with authorCol:
            st.text_input(labels['author'], key='author')
        with titleCol:
            st.text_input(labels['titleTXT'], key='title')
        st.form_submit_button(labels['submit'], on_click=upPage)
# Page Code End # Page Code End
if 'submit' not in st.session_state:
st.session_state.submit = False
if st.session_state.page == 0: if st.session_state.page == 0:
if st.session_state.submit: if st.session_state.submit:
st.session_state.submit = False st.session_state.submit = False
if st.session_state.file != None: if st.session_state.file != None:
print(st.session_state.file)
st.session_state.page = 1 st.session_state.page = 1
else: else:
askTXT() askTXT()
uploadZip()
elif st.session_state.zipSubmit:
st.session_state.zipSubmit = False
if st.session_state.zipFile != None:
st.session_state.zipDir = tempfile.TemporaryDirectory()
st.session_state.txtDir = tempfile.TemporaryDirectory()
st.session_state.tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract"
extractAllTXT(st.session_state.zipDir.name,
st.session_state.zipFile)
st.session_state.len = len(
os.listdir(st.session_state.zipDir.name))
st.session_state.nbDoc = 0
st.session_state.page = 2
else:
askTXT()
uploadZip()
else: else:
askTXT() askTXT()
uploadZip()
if st.session_state.page == 1: if st.session_state.page == 1:
name = st.session_state.file.name name = st.session_state.file.name
...@@ -201,4 +337,35 @@ if st.session_state.page == 1: ...@@ -201,4 +337,35 @@ if st.session_state.page == 1:
st.session_state.submit = False st.session_state.submit = False
if st.session_state.warning != "": if st.session_state.warning != "":
st.info(st.session_state.warning) st.info(st.session_state.warning)
st.download_button(name, txt, name, on_click=setPage()) st.download_button(name, txt, name, on_click=resetPage)
# Page 3: convert the TXT file named in the session state, append its rows to
# the compiled TSV, then move to the next file (page 2) or to the download
# step (page 4).
# NOTE(review): `tmp` is also the alias under which this file imports
# `src.basic`; this assignment rebinds it — confirm nothing later needs it.
if st.session_state.page == 3:
fileName = st.session_state.fileName
tmp = txtToTSV2(
fileName, st.session_state.zipDir.name + '/' + fileName, st.session_state.txtDir.name)
st.session_state.tsv += "\n" + tmp
# nbDoc is compared with len - 1 to decide whether this was the last file.
if st.session_state.nbDoc == st.session_state.len - 1:
st.session_state.page = 4
else:
st.session_state.nbDoc += 1
st.session_state.page = 2
# Page 2: pick the nbDoc-th entry of the extraction directory, remember it in
# the session state and show the author/title form for it.
if st.session_state.page == 2:
fileName = os.listdir(st.session_state.zipDir.name)[st.session_state.nbDoc]
st.session_state.fileName = fileName
# Only the displayed name is shortened; the session keeps the original value.
if '/' in fileName:
fileName = fileName.split('/')[1]
askTXT2(fileName)
# Page 4: write the compiled TSV into txtDir, archive txtDir as
# TXTCompilation.zip inside zipDir, then offer that archive for download.
if st.session_state.page == 4:
with open(st.session_state.txtDir.name + "/TXTCompilation.tsv", "w", encoding='utf-8-sig') as file:
file.write(st.session_state.tsv)
shutil.make_archive(st.session_state.zipDir.name +
"/TXTCompilation", 'zip', st.session_state.txtDir.name)
# NOTE(review): the name `zip` shadows the builtin of the same name here.
with open(st.session_state.zipDir.name + "/TXTCompilation.zip", 'rb') as zip:
if st.session_state.warning != "":
st.info(st.session_state.warning)
detectMultipleTxtLanguages()
st.write(st.session_state.general_text_dict['new_file'])
# resetPage is passed as the on_click callback of the download button.
st.download_button("TXTCompilation.zip",
zip, "TXTCompilation.zip", on_click=resetPage)
...@@ -4,11 +4,6 @@ Nicolas Atrax ...@@ -4,11 +4,6 @@ Nicolas Atrax
""" """
import streamlit as st import streamlit as st
import pandas as pd
import chardet
import re
import codecs
import os
import tempfile import tempfile
import shutil import shutil
from datetime import date from datetime import date
...@@ -17,6 +12,8 @@ from lib.youtubetranscript.youtube_transcript_api import YouTubeTranscriptApi ...@@ -17,6 +12,8 @@ from lib.youtubetranscript.youtube_transcript_api import YouTubeTranscriptApi
from lib.youtubetranscript.youtube_transcript_api._transcripts import NoTranscriptFound from lib.youtubetranscript.youtube_transcript_api._transcripts import NoTranscriptFound
from lib.youtubetranscript.youtube_transcript_api._transcripts import TranscriptsDisabled from lib.youtubetranscript.youtube_transcript_api._transcripts import TranscriptsDisabled
import src.basic as tmp import src.basic as tmp
import time
import random
tmp.base("YTBtoTSV") tmp.base("YTBtoTSV")
...@@ -24,7 +21,11 @@ tmp.base("YTBtoTSV") ...@@ -24,7 +21,11 @@ tmp.base("YTBtoTSV")
def ytbSearch(search, n): def ytbSearch(search, n):
videosSearch = VideosSearch(search) if st.session_state.videoLang == 'fr':
region = 'FR'
else:
region = 'US'
videosSearch = VideosSearch(search, region=region)
result = videosSearch.result()["result"] result = videosSearch.result()["result"]
videos = [] videos = []
while len(videos) < n: while len(videos) < n:
...@@ -35,8 +36,11 @@ def ytbSearch(search, n): ...@@ -35,8 +36,11 @@ def ytbSearch(search, n):
videos.append([id, author, title]) videos.append([id, author, title])
if len(videos) == n: if len(videos) == n:
break break
if len(videos) == n:
break
tmpResult = result tmpResult = result
videosSearch.next() videosSearch.next()
time.sleep(1.0)
result = videosSearch.result()["result"] result = videosSearch.result()["result"]
if result == tmpResult: if result == tmpResult:
break break
...@@ -44,27 +48,37 @@ def ytbSearch(search, n): ...@@ -44,27 +48,37 @@ def ytbSearch(search, n):
def getLang(list): def getLang(list):
tmp = ""
for lang in list: for lang in list:
return str(lang).split(" ")[0] tmp = str(lang).split(" ")[0]
if tmp == st.session_state.videoLang:
break
return tmp
def translatedTranscript(lang, lst, title, manual): def translateTranscript(lst, lang):
if lang != "en": origin = lst.find_transcript([lang])
res = lst.find_transcript([lang]) manual = not origin.is_generated
trans = res.translate("en").fetch() return origin.translate(st.session_state.videoLang).fetch(), manual
return trans
return lst.find_transcript([lang]).fetch()
def ytbTranscript(id, title): def ytbTranscript(id, title):
try: try:
transcriptList = YouTubeTranscriptApi.list_transcripts(id) transcriptList = YouTubeTranscriptApi.list_transcripts(id)
lang = getLang(transcriptList) lang = getLang(transcriptList)
if lang != st.session_state.videoLang:
return translateTranscript(transcriptList, lang)
try: try:
transcriptList.find_manually_created_transcript([lang]) transcript = transcriptList.find_manually_created_transcript(
return translatedTranscript(lang, transcriptList, title, True), True [st.session_state.videoLang]).fetch()
return transcript, True
except NoTranscriptFound: except NoTranscriptFound:
return translatedTranscript(lang, transcriptList, title, False), False try:
transcript = transcriptList.find_generated_transcript(
[st.session_state.videoLang]).fetch()
return transcript, False
except NoTranscriptFound:
return None, False
except TranscriptsDisabled: except TranscriptsDisabled:
return None, False return None, False
...@@ -133,6 +147,12 @@ def transcriptManualToDoc(transcript, author, title, date): ...@@ -133,6 +147,12 @@ def transcriptManualToDoc(transcript, author, title, date):
else: else:
tmp += text + " " tmp += text + " "
time += float(part["duration"]) time += float(part["duration"])
if time >= 20:
tsv = tsvAdd(tsv, tmp, author, title, date, count)
tmp = ""
time = 0
count += 1
with open(st.session_state.zipDir.name + "/" + title + ".tsv", "w", encoding="utf-8-sig") as file: with open(st.session_state.zipDir.name + "/" + title + ".tsv", "w", encoding="utf-8-sig") as file:
file.write(tsv) file.write(tsv)
tsv = "\n".join(tsv.split("\n")[1:]) tsv = "\n".join(tsv.split("\n")[1:])
...@@ -170,15 +190,19 @@ def transcriptToTsv(search, nbVideos): ...@@ -170,15 +190,19 @@ def transcriptToTsv(search, nbVideos):
dict = st.session_state.general_text_dict dict = st.session_state.general_text_dict
with st.spinner(dict['loadingID']): with st.spinner(dict['loadingID']):
if st.session_state.manualOnly: if st.session_state.manualOnly:
videos = ytbSearch(search, nbVideos * 20) videos = ytbSearch(search, nbVideos * 15)
else: else:
videos = ytbSearch(search, nbVideos * 4) videos = ytbSearch(search, nbVideos * 5)
count = 0 count = 0
countManual = 0 countTotal = 0
bar = st.progress(count / nbVideos, dict['loading'] + bar = st.progress(count / nbVideos, dict['loading'] +
str(count) + dict['quantity'] + str(nbVideos)) str(count) + dict['quantity'] + str(nbVideos))
for video in videos: for video in videos:
print(count) countTotal += 1
waitingTime = random.uniform(2.0, 7.0)
# print("Waiting time : " + str(waitingTime))
time.sleep(waitingTime)
# print(countTotal)
bar.progress(count / nbVideos, dict['loading'] + bar.progress(count / nbVideos, dict['loading'] +
str(count) + dict['quantity'] + str(nbVideos)) str(count) + dict['quantity'] + str(nbVideos))
if count == nbVideos: if count == nbVideos:
...@@ -191,7 +215,6 @@ def transcriptToTsv(search, nbVideos): ...@@ -191,7 +215,6 @@ def transcriptToTsv(search, nbVideos):
continue continue
transcript = correctTranscript(transcript) transcript = correctTranscript(transcript)
if manual: if manual:
countManual += 1
tsv += transcriptManualToDoc(transcript, tsv += transcriptManualToDoc(transcript,
author, title, str(date.today().year)) author, title, str(date.today().year))
count += 1 count += 1
...@@ -226,14 +249,17 @@ def resetPage(): ...@@ -226,14 +249,17 @@ def resetPage():
def askVideos(): def askVideos():
with st.form("Submit"): with st.form("Submit"):
st.write(st.session_state.general_text_dict['text']) st.write(st.session_state.general_text_dict['text'])
st.write(st.session_state.general_text_dict['text2'])
st.selectbox(st.session_state.general_text_dict['videoLang'], ['fr', 'en'],
key='videoLang')
st.text_input( st.text_input(
st.session_state.general_text_dict['keywords'], key='keywords') st.session_state.general_text_dict['keywords'], key='keywords')
st.slider( st.slider(
st.session_state.general_text_dict['number'], 1, 30, key='nb_taken') st.session_state.general_text_dict['number'], 1, 20, key='nb_taken')
st.checkbox( st.checkbox(
st.session_state.general_text_dict['fill'], key='manualOnly') st.session_state.general_text_dict['fill'], key='manualOnly')
st.form_submit_button( st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=setSubmit()) st.session_state.general_text_dict['submit'], on_click=setSubmit)
# Page Code End # Page Code End
...@@ -258,4 +284,4 @@ if st.session_state.page == 1: ...@@ -258,4 +284,4 @@ if st.session_state.page == 1:
with open(compilName + ".zip", 'rb') as zip: with open(compilName + ".zip", 'rb') as zip:
st.write(st.session_state.general_text_dict['new_file']) st.write(st.session_state.general_text_dict['new_file'])
st.download_button(st.session_state.keywords + ".zip", st.download_button(st.session_state.keywords + ".zip",
zip, st.session_state.keywords + ".zip", on_click=resetPage()) zip, st.session_state.keywords + ".zip", on_click=resetPage)
...@@ -6,7 +6,7 @@ Loïc Chapron ...@@ -6,7 +6,7 @@ Loïc Chapron
import streamlit as st import streamlit as st
import requests as req import requests as req
import json import json
from datetime import date from datetime import date, datetime
import src.basic as tmp import src.basic as tmp
...@@ -226,6 +226,7 @@ if st.session_state.stage == 0: ...@@ -226,6 +226,7 @@ if st.session_state.stage == 0:
# Form # Form
form = st.form('api') form = st.form('api')
lst = ['items', 'collections'] lst = ['items', 'collections']
st.session_state.id = form.text_input( st.session_state.id = form.text_input(
'ID', st.session_state.id, key='idForm', help=st.session_state.general_text_dict['help']) 'ID', st.session_state.id, key='idForm', help=st.session_state.general_text_dict['help'])
...@@ -307,7 +308,8 @@ if st.session_state.stage == 2 and st.session_state.format == 'collections': ...@@ -307,7 +308,8 @@ if st.session_state.stage == 2 and st.session_state.format == 'collections':
output = createTSVfromCollections() output = createTSVfromCollections()
st.write(st.session_state.general_text_dict['fileTSV1'] + str( st.write(st.session_state.general_text_dict['fileTSV1'] + str(
len(output.split('\n'))-2) + st.session_state.general_text_dict['fileTSV2']) len(output.split('\n'))-2) + st.session_state.general_text_dict['fileTSV2'])
st.download_button('Download TSV', output, 'output.csv') fileName = "zoteroOutput_" + str(datetime.now().strftime("%Y-%m-%d_%H:%M:%S")) + '.csv'
st.download_button('Download TSV', output, fileName)
if st.session_state.stage > 0: if st.session_state.stage > 0:
......
...@@ -5,6 +5,11 @@ from st_pages import show_pages_from_config, add_indentation ...@@ -5,6 +5,11 @@ from st_pages import show_pages_from_config, add_indentation
def base(page): def base(page):
st.set_page_config(
page_title="GarganTools | " + page,
page_icon="img/isc-pif_logo.png",
)
st.markdown( st.markdown(
f''' f'''
<style> <style>
...@@ -56,10 +61,11 @@ def base(page): ...@@ -56,10 +61,11 @@ def base(page):
show_pages_from_config() show_pages_from_config()
elif st.session_state.general_session_page != page: elif st.session_state.general_session_page != page:
st.session_state.general_text_dict = load_bundle(st.session_state.general_language) st.session_state.general_text_dict = load_bundle(
st.session_state.general_language)
st.session_state.general_session_page = page st.session_state.general_session_page = page
show_pages_from_config() show_pages_from_config()
# Delete every key who aren't fron this file # Delete every key who aren't fron this file
for key in st.session_state.keys(): for key in st.session_state.keys():
if 'general_' not in key: if 'general_' not in key:
...@@ -67,14 +73,12 @@ def base(page): ...@@ -67,14 +73,12 @@ def base(page):
add_indentation() add_indentation()
# select the lang # select the lang
coltitle,col = st.columns([4,1]) coltitle, col = st.columns([4, 1])
with coltitle: with coltitle:
st.write(st.session_state.general_text_dict['title']) st.write(st.session_state.general_text_dict['title'])
with col: with col:
_,col1, col2 = st.columns([1,1,1]) _, col1, col2 = st.columns([1, 1, 1])
with col1: with col1:
st.button(':fr:', on_click=update_lang, args=('fr',)) st.button(':fr:', on_click=update_lang, args=('fr',))
with col2: with col2:
st.button(':us:', on_click=update_lang, args=('en',)) st.button(':us:', on_click=update_lang, args=('en',))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment