Commit be50ef3c authored by Nicolas Atrax

Update PDF_to_TSV.py

parent 450d9bcf
......@@ -65,7 +65,7 @@ def estimateLanguagesPercentage(languages):
for l in languages:
total += int(languages[l])
for l in languages:
tmp = int(languages[l]) / total * 100
tmp = round(int(languages[l]) / total * 100, 1)
if tmp > max:
max = tmp
principal = l
......@@ -167,15 +167,20 @@ def correctedSequence(text, last):
def getInfo():
    """Return the (author, title, watermark) triple stored in the session state.

    When no title is stored, the title falls back to the stored file name
    with its ".pdf" marker removed, so the returned title is never empty
    as long as a file name is set.
    """
    title = st.session_state.title
    # The previous version returned the raw session title before this point,
    # which left the fallback below unreachable.
    if not title:
        title = st.session_state.fileName.replace(".pdf", "")
    return st.session_state.author, title, st.session_state.watermark
def txtToTSV(fileName, fileAddress, pdfDir):
st.session_state.page = 1
author, title, watermark = getInfo()
tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
tsv, languages = segmentAbstract(fileName, fileAddress, tsv, author, title,
str(date.today().year), "1", "1", watermark)
with st.spinner(st.session_state.general_text_dict['loading']):
tsv, languages = segmentAbstract(fileName, fileAddress, tsv, author, title,
str(date.today().year), "1", "1", watermark)
if '/' in fileName:
fileName = fileName.split('/')[1]
with open(pdfDir + "/" + fileName.replace(".pdf", ".tsv"), "w", encoding="utf-8-sig") as file:
......@@ -224,6 +229,7 @@ def upPage():
def uploadZip():
    """Write the 'text' and 'text2' messages, then show the ZIP uploader.

    Returns whatever ``st.file_uploader`` returns for the widget keyed 'file'.
    """
    texts = st.session_state.general_text_dict
    for label in ('text', 'text2'):
        st.write(texts[label])
    return st.file_uploader(texts['file'], type=["zip"], key='file')
......@@ -231,6 +237,7 @@ def uploadZip():
def askPDF(fileName):
with st.form("Submit"):
st.write(fileName)
st.write(st.session_state.general_text_dict['text3'])
col1, col2 = st.columns(2)
st.session_state.author = ""
st.session_state.title = ""
......@@ -279,6 +286,7 @@ if st.session_state.page == 2:
if st.session_state.page == 1:
fileName = os.listdir(st.session_state.zipDir.name)[st.session_state.nbDoc]
st.session_state.fileName = fileName
if '/' in fileName:
fileName = fileName.split('/')[1]
askPDF(fileName)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment