"""
Streamlit Application
Nicolas Atrax

Converts an uploaded CSV file that has ``Authors``, ``Title``, ``Source``,
``Abstract`` and ``Year`` columns into a TSV file whose first columns are
``authors``, ``source``, ``publication_year``, ``publication_month``,
``publication_day``, ``title`` and ``abstract``; every other CSV column is
appended after them.
"""
import streamlit as st
import pandas as pd
import chardet
import csv
import re
from datetime import date
import codecs
import tempfile
import src.basic as tmp

tmp.base("CSVHarzingToTSV")

# Tool Code Start

# CSV columns that are mapped onto the fixed leading TSV columns.
_KNOWN_COLUMNS = ("Authors", "Title", "Source", "Abstract", "Year")


def _isExtraColumn(name):
    """Return True when *name* is a CSV column that is not one of the mapped ones.

    ``None`` (the key ``csv.DictReader`` uses for surplus fields of an
    over-long row) is never an extra column, because it has no header.
    """
    if name is None:
        return False
    return name.replace("\ufeff", "") not in _KNOWN_COLUMNS


def getSeparator():
    """Guess the CSV delimiter from the next line of the uploaded file.

    Reads one line from ``st.session_state.file`` (the caller is responsible
    for rewinding the file afterwards).

    Returns:
        ``','`` if the line contains a comma, else ``';'`` if it contains a
        semicolon, else a tab.
    """
    line = st.session_state.file.readline().decode('utf-8-sig')
    if ',' in line:
        return ','
    if ';' in line:
        return ';'
    return '\t'


def correctedSequence(text, separator):
    """Escape one field so it can be written into the TSV output.

    Double quotes are doubled, and the field is wrapped in double quotes when
    it contains the input separator, a tab, a double quote or a newline. A
    single trailing newline is dropped from quoted fields.

    Args:
        text: Field value; ``None`` (missing field in a short row) is
            treated as an empty field.
        separator: Delimiter of the source CSV.

    Returns:
        The escaped field as a string.
    """
    if text is None:
        return ""
    escaped = text.replace("\"", "\"\"")
    # The output is tab-separated, so a raw tab must be quoted as well.
    needsQuotes = (separator in text or "\t" in text
                   or "\"" in text or "\n" in text)
    if not needsQuotes:
        return escaped
    if text.endswith("\n"):
        escaped = escaped[:-1]
    return "\"" + escaped + "\""


def getValues(items):
    """Extract the five mapped fields from a cleaned row.

    Args:
        items: Mapping holding the ``Authors``, ``Title``, ``Source``,
            ``Abstract`` and ``Year`` keys.

    Returns:
        Tuple ``(authors, title, source, abstract, year)``.

    Raises:
        KeyError: If one of the five columns is missing.
    """
    return (items['Authors'], items["Title"], items["Source"],
            items["Abstract"], items["Year"])


def tsvAddOthersColumns(tsv, separator):
    """Append the names of all non-mapped CSV columns to the TSV header.

    Reads the CSV header from the current position of
    ``st.session_state.file``.

    Args:
        tsv: Header text built so far.
        separator: Delimiter of the source CSV.

    Returns:
        *tsv* followed by one tab-prefixed name per extra column and a
        terminating newline.
    """
    reader = csv.DictReader(codecs.iterdecode(
        st.session_state.file, 'utf-8-sig'), delimiter=separator)
    # Use the header itself rather than the first data row, so that a file
    # without data rows still yields a complete header line.  dict.fromkeys
    # collapses duplicate names the same way the row dictionaries do.
    extra = []
    for name in dict.fromkeys(reader.fieldnames or ()):
        if _isExtraColumn(name):
            cleaned = name.replace("\ufeff", "")
            extra.append("\t" + correctedSequence(cleaned, separator))
    return tsv + "".join(extra) + "\n"


def tsvAddOthers(tsv, items, separator):
    """Append the values of all non-mapped CSV columns to a TSV row.

    Args:
        tsv: Row text built so far.
        items: Iterable of ``(column name, value)`` pairs of the CSV row.
        separator: Delimiter of the source CSV.

    Returns:
        *tsv* followed by one tab-prefixed escaped value per extra column.
    """
    extra = [
        "\t" + correctedSequence(value, separator)
        for name, value in items
        if _isExtraColumn(name)
    ]
    return tsv + "".join(extra)


def getCleanItems(items):
    """Build a row dictionary whose mapped column names are normalised.

    When a column name starts with a non-alphanumeric character and the rest
    of the name is one of the mapped columns, the value is stored under the
    name without that first character; otherwise the name is kept unchanged.

    Args:
        items: Iterable of ``(column name, value)`` pairs.

    Returns:
        Dictionary of the (possibly renamed) columns.
    """
    res = {}
    for name, value in items:
        candidate = name
        if name and re.search('[a-zA-Z0-9]', name[0]) is None:
            candidate = name[1:]
        if candidate in _KNOWN_COLUMNS:
            res[candidate] = value
        else:
            res[name] = value
    return res


def HarzingToTsv(separator):
    """Convert the uploaded CSV (``st.session_state.file``) into TSV text.

    Args:
        separator: Delimiter of the source CSV.

    Returns:
        The whole TSV document, header line included.

    Raises:
        KeyError: If the CSV lacks one of the five mapped columns.
    """
    header = ("authors\tsource\tpublication_year\tpublication_month"
              "\tpublication_day\ttitle\tabstract")
    st.session_state.file.seek(0)
    lines = [tsvAddOthersColumns(header, separator)]

    # Second pass over the file for the data rows.
    st.session_state.file.seek(0)
    reader = csv.DictReader(codecs.iterdecode(
        st.session_state.file, 'utf-8-sig'), delimiter=separator)
    for row in reader:
        authors, title, source, abstract, year = getValues(
            getCleanItems(row.items()))
        line = "\t".join([
            correctedSequence(authors, separator),
            correctedSequence(source, separator),
            correctedSequence(year, separator),
            "1",  # publication_month: fixed value, the CSV only has a year
            "1",  # publication_day: fixed value, the CSV only has a year
            correctedSequence(title, separator),
            correctedSequence(abstract, separator),
        ])
        # Always emit the extra columns so every row matches the header.
        line = tsvAddOthers(line, row.items(), separator)
        lines.append(line + "\n")
    return "".join(lines)


def getCSV():
    """Copy the uploaded file into a new temporary directory.

    The directory object is stored in ``st.session_state.pdfDir`` and the
    copy keeps the name of the uploaded file.
    """
    st.session_state.pdfDir = tempfile.TemporaryDirectory()
    name = st.session_state.file.name
    with open(st.session_state.pdfDir.name + "/" + name, "wb") as out:
        out.write(st.session_state.file.getvalue())
# Tool Code End


form = st.form('api')

# Page Code Start
if 'page' not in st.session_state:
    st.session_state.page = 0


def resetPage():
    """Return to the upload page (used as the download button callback)."""
    st.session_state.page = 0
# Page Code End


# NOTE(review): general_text_dict is not set in this file; presumably
# tmp.base() populates it -- confirm.
if st.session_state.page == 0:
    st.write(st.session_state.general_text_dict['text'])
    file = st.file_uploader(
        st.session_state.general_text_dict['file'], type=["csv"], key='file')
    if file:
        st.session_state.page = 1

if st.session_state.page == 1:
    separator = getSeparator()
    st.session_state.file.seek(0)
    tsv = HarzingToTsv(separator)
    # Replace only the last extension so names such as "a.b.csv" keep "a.b".
    name = st.session_state.file.name.rsplit('.', 1)[0] + '.tsv'
    st.write(st.session_state.general_text_dict['new_file'])
    st.download_button(name, tsv, name, on_click=resetPage)