import streamlit as st
import pandas as pd
from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
from prophet import Prophet
import sentencepiece  # imported so a missing T5Tokenizer dependency fails fast
st.markdown("""
<div style='display: flex; align-items: center;'>
<div style='width: 20px; height: 20px; background-color: green; border-radius: 50%; margin-right: 2px;'></div>
<div style='width: 20px; height: 20px; background-color: red; border-radius: 50%; margin-right: 2px;'></div>
<div style='width: 20px; height: 20px; background-color: yellow; border-radius: 50%; margin-right: 10px;'></div>
<span style='font-size: 40px; font-weight: bold;'>Chatbot do Tesouro RS</span>
</div>
""", unsafe_allow_html=True)
# File upload interface
uploaded_file = st.file_uploader("Upload a CSV or XLSX file", type=['csv', 'xlsx'])
if uploaded_file:
    # Load the file into a DataFrame
    if uploaded_file.name.endswith('.csv'):
        df = pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
    elif uploaded_file.name.endswith('.xlsx'):
        df = pd.read_excel(uploaded_file)
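    # NOTE: the slicing below assumes the Tesouro RS spreadsheet layout implied
    # by the hard-coded indices (an assumption, not documented in the file):
    # two header rows, nine leading metadata columns, one trailing column, and
    # monthly value columns labelled like "Jan/2023 (1)".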
    # Data preprocessing for Prophet
    new_df = df.iloc[2:, 9:-1].fillna(0)
    new_df.columns = df.iloc[1, 9:-1]
    # Strip footnote markers such as " (1)" from the column names
    new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)
    month_dict = {
        'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
        'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
        'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12'
    }
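    # Normalize headers like "Jan/2023" to "01/2023" so they can be parsed as
    # dates below; the label column 'Rótulos de Linha' is passed through as-is.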
    def convert_column_name(column_name):
        if column_name == 'Rótulos de Linha':
            return column_name
        parts = column_name.split('/')
        month = parts[0].strip()
        year = ''.join(filter(str.isdigit, parts[1]))
        month_number = month_dict.get(month, '00')
        return f"{month_number}/{year}"

    new_df.columns = [convert_column_name(col) for col in new_df.columns]
    # Parse the "MM/YYYY" headers into Timestamps; the label column becomes NaT
    # and is renamed to 'Rotulo'
    new_df.columns = pd.to_datetime(new_df.columns, errors='coerce')
    new_df.rename(columns={new_df.columns[0]: 'Rotulo'}, inplace=True)
    df_clean = new_df.copy()
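    # Anomaly detection: each remaining row of df_clean is one monthly time
    # series. Prophet is fit per series with a 95% prediction interval, and
    # observed points falling outside that interval are flagged as anomalies.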
    # Create an empty DataFrame to store all anomalies
    all_anomalies = pd.DataFrame()

    # Process each row in the DataFrame
    for index, row in df_clean.iterrows():
        data = pd.DataFrame({
            'ds': [col for col in df_clean.columns if isinstance(col, pd.Timestamp)],
            'y': row[[isinstance(col, pd.Timestamp) for col in df_clean.columns]].values
        })
        data = data[data['y'] > 0].reset_index(drop=True)
        if data.empty or len(data) < 2:
            print(f"Skipping group {row['Rotulo']} because there are fewer than 2 non-zero observations.")
            continue

        try:
            model = Prophet(interval_width=0.95)
            model.fit(data)
        except ValueError as e:
            print(f"Skipping group {row['Rotulo']} due to error: {e}")
            continue

        future = model.make_future_dataframe(periods=12, freq='M')
        forecast = model.predict(future)

        # Align the observed values with the forecast rows; future months get NaN
        num_real = len(data)
        num_forecast = len(forecast)
        forecast['real'] = list(data['y']) + [None] * (num_forecast - num_real)

        # Flag observations outside the 95% prediction interval; .copy() avoids
        # a SettingWithCopyWarning when the 'Group' column is added
        anomalies = forecast[(forecast['real'] < forecast['yhat_lower']) |
                             (forecast['real'] > forecast['yhat_upper'])].copy()
        anomalies['Group'] = row['Rotulo']
        all_anomalies = pd.concat([all_anomalies, anomalies[['ds', 'real', 'Group']]], ignore_index=True)
    # Prepare the anomalies DataFrame for the TAPEX model, which expects every table cell as a string
    all_anomalies.rename(columns={"ds": "datetime", "real": "monetary value", "Group": "explanation"}, inplace=True)
    all_anomalies['monetary value'] = all_anomalies['monetary value'].apply(lambda x: f"{x:.2f}")
    all_anomalies = all_anomalies.fillna('').astype(str)
    # Load translation and TAPEX models once and cache them across Streamlit
    # reruns (st.cache_resource keeps the heavy downloads out of the rerun path)
    @st.cache_resource
    def load_models():
        pt_en = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5")
        en_pt = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-en-pt-t5")
        t5_tokenizer = T5Tokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
        tapex = BartForConditionalGeneration.from_pretrained("microsoft/tapex-large-finetuned-wtq")
        tapex_tok = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
        return pt_en, en_pt, t5_tokenizer, tapex, tapex_tok

    pt_en_translator, en_pt_translator, tokenizer, tapex_model, tapex_tokenizer = load_models()
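    # TAPEX (microsoft/tapex-large-finetuned-wtq) is finetuned on the English
    # WikiTableQuestions benchmark, so questions are translated pt -> en before
    # querying the table, and the answer is translated en -> pt afterwards.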
    def translate(text, model, tokenizer, source_lang="pt", target_lang="en"):
        # The unicamp-dl T5 models are trained with a task prefix such as
        # "translate Portuguese to English: ", so build it from the language codes
        lang_names = {"pt": "Portuguese", "en": "English"}
        prefix = f"translate {lang_names[source_lang]} to {lang_names[target_lang]}: "
        input_ids = tokenizer.encode(prefix + text, return_tensors="pt", add_special_tokens=True)
        outputs = model.generate(input_ids)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    def response(user_question, table_data):
        question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")
        encoding = tapex_tokenizer(table=table_data, query=[question_en], padding=True, return_tensors="pt", truncation=True)
        outputs = tapex_model.generate(**encoding)
        response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        response_pt = translate(response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
        return response_pt
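    # Hypothetical usage (illustrative only, not part of the app flow):
    #   response("Qual é o maior valor?", all_anomalies)
    # queries TAPEX with the English translation of the question and returns
    # the answer translated back to Portuguese.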
    # Streamlit interface
    st.dataframe(all_anomalies.head())

    # Chat history
    if 'history' not in st.session_state:
        st.session_state['history'] = []

    user_question = st.text_input("Escreva sua questão aqui:", "")

    if user_question:
        st.session_state['history'].append(('👤', user_question))
        bot_response = response(user_question, all_anomalies)
        st.session_state['history'].append(('🤖', bot_response))

    if st.button("Limpar"):
        st.session_state['history'] = []

    # Render the conversation once, from the stored history, so the current
    # exchange is not drawn twice
    for sender, message in st.session_state['history']:
        if sender == '👤':
            st.markdown(f"**👤 {message}**")
        elif sender == '🤖':
            st.markdown(f"<div style='text-align: right'>**🤖 {message}**</div>", unsafe_allow_html=True)
else:
    st.warning("Please upload a CSV or XLSX file to start.")