import streamlit as st
import pandas as pd
from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
from prophet import Prophet
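
# Overview: the app loads a CSV/XLSX spreadsheet, detects per-group anomalies
# with Prophet (tab "Meta Prophet"), then answers Portuguese questions about
# the anomaly table via pt->en translation, TAPEX table QA, and en->pt
# translation (tab "Microsoft TAPEX").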
# Open and read the CSS file
with open("style.css", "r") as css:
    css_style = css.read()
# Combined markup: the font import plus the header HTML
html_content = f"""
<style>
{css_style}
@import url('https://fonts.googleapis.com/css2?family=Kanit:wght@700&display=swap');
</style>
<div style='display: flex; flex-direction: column; align-items: flex-start;'>
<div style='display: flex; align-items: center;'>
<div style='width: 20px; height: 5px; background-color: green; margin-right: 0px;'></div>
<div style='width: 20px; height: 5px; background-color: red; margin-right: 0px;'></div>
<div style='width: 20px; height: 5px; background-color: yellow; margin-right: 18px;'></div>
<span style='font-size: 38px; font-weight: normal; font-family: "Kanit", sans-serif;'>NOSTRADAMUS</span>
</div>
</div>
"""
# Render the combined markup in Streamlit
st.markdown(html_content, unsafe_allow_html=True)
# Initialize session-state variables
if 'all_anomalies' not in st.session_state:
    st.session_state['all_anomalies'] = pd.DataFrame()
if 'history' not in st.session_state:
    st.session_state['history'] = []
# Load the translation and TAPEX models
pt_en_translator = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5")
en_pt_translator = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-en-pt-t5")
tapex_model = BartForConditionalGeneration.from_pretrained("microsoft/tapex-large-finetuned-wtq")
tapex_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
tokenizer = T5Tokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
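
# Note: the from_pretrained calls above run again on every Streamlit rerun.
# A cached loader would keep a single copy per process; a minimal sketch,
# assuming Streamlit's st.cache_resource is available (load_tapex is a
# hypothetical helper name):
#
#   @st.cache_resource
#   def load_tapex():
#       model = BartForConditionalGeneration.from_pretrained("microsoft/tapex-large-finetuned-wtq")
#       tok = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
#       return model, tok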

def translate(text, model, tokenizer, source_lang="pt", target_lang="en"):
    # source_lang/target_lang are informational only; the translation
    # direction is fixed by whichever model is passed in
    input_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
    outputs = model.generate(input_ids)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

def response(user_question, table_data):
    question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")
    # TAPEX expects every table cell to be a string
    table_data = table_data.astype(str)
    encoding = tapex_tokenizer(table=table_data, query=[question_en], padding=True, return_tensors="pt", truncation=True)
    outputs = tapex_model.generate(**encoding)
    response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    response_pt = translate(response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
    return response_pt
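
# Note: a single T5 tokenizer (loaded from the pt-en checkpoint) is reused
# for both translation directions above; this assumes the unicamp-dl pt-en
# and en-pt models share a vocabulary.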

def load_data(uploaded_file):
    if uploaded_file.name.endswith('.csv'):
        df = pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
    elif uploaded_file.name.endswith('.xlsx'):
        df = pd.read_excel(uploaded_file)
    else:
        # Guard against df being unbound; the uploader only offers csv/xlsx
        raise ValueError("Unsupported file type; expected .csv or .xlsx")
    return df
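
# preprocess_data assumes the layout of the source spreadsheet: header
# labels in df row index 1, data from row index 2 onward, and the label
# plus monthly columns in columns 9 through the second-to-last.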
def preprocess_data(df):
    new_df = df.iloc[2:, 9:-1].fillna(0)
    new_df.columns = df.iloc[1, 9:-1]
    new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)

    month_dict = {
        'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
        'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
        'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12'
    }

    def convert_column_name(column_name):
        # The label column keeps its original name
        if column_name == 'Rótulos de Linha':
            return column_name
        # Otherwise convert "Mon/Year" to "MM/YYYY"
        parts = column_name.split('/')
        month = parts[0].strip()
        year = parts[1].strip()
        # Clean the year in case there are extra characters
        year = ''.join(filter(str.isdigit, year))
        # Get the month number from the dictionary; default to '00' if not found
        month_number = month_dict.get(month, '00')
        # Return the formatted date string
        return f"{month_number}/{year}"
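
    # Example: convert_column_name("Abr/2024") -> "04/2024"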
    new_df.columns = [convert_column_name(col) for col in new_df.columns]
    # The label header is coerced to NaT here, then renamed to 'Rotulo' below
    new_df.columns = pd.to_datetime(new_df.columns, errors='coerce')
    new_df.rename(columns={new_df.columns[0]: 'Rotulo'}, inplace=True)
    df_clean = new_df.copy()
    return df_clean
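
# Prophet expects a two-column frame with 'ds' (dates) and 'y' (values);
# apply_prophet builds one per row of df_clean and flags observations that
# fall outside the 95% prediction interval as anomalies.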
def apply_prophet(df_clean):
    if df_clean.empty:
        st.error("DataFrame está vazio após o pré-processamento.")
        return pd.DataFrame()

    # Debugging: check the structure of df_clean
    st.write("Estrutura do DataFrame df_clean:")
    st.write(df_clean)

    # Empty DataFrame to accumulate the anomalies of every group
    all_anomalies = pd.DataFrame()

    # Process each row (group) in the DataFrame
    for index, row in df_clean.iterrows():
        # Extract the timestamp columns and this row's values
        date_columns = [col for col in df_clean.columns if isinstance(col, pd.Timestamp)]
        data = pd.DataFrame({
            'ds': date_columns,
            'y': row[date_columns].values
        })

        # Debugging: check the data passed into Prophet
        st.write(f"Dados para Prophet - Grupo {row['Rotulo']}:")
        st.write(data)

        # Remove rows where 'y' is zero or missing
        data = data[data['y'] > 0].dropna().reset_index(drop=True)

        # Ensure there's enough data for Prophet to run
        if data.empty or len(data) < 2:
            st.write(f"Pular grupo {row['Rotulo']} por não ter observações suficientes.")
            continue

        try:
            # Create and fit the Prophet model
            model = Prophet(interval_width=0.95)
            model.fit(data)
        except ValueError as e:
            st.write(f"Pular grupo {row['Rotulo']} devido ao erro: {e}")
            continue

        # Forecast 12 months beyond the observed data
        future = model.make_future_dataframe(periods=12, freq='M')
        forecast = model.predict(future)

        # Attach the real values and flag points outside the prediction interval;
        # copy the slice so the 'group' assignment below does not warn
        real_values = list(data['y']) + [None] * (len(forecast) - len(data))
        forecast['real'] = real_values
        anomalies = forecast[(forecast['real'] < forecast['yhat_lower']) | (forecast['real'] > forecast['yhat_upper'])].copy()

        # Debugging: check the anomalies detected
        st.write(f"Anomalias detectadas para o grupo {row['Rotulo']}:")
        st.write(anomalies)

        # Add the group label and append to the accumulated anomalies
        anomalies['group'] = row['Rotulo']
        all_anomalies = pd.concat([all_anomalies, anomalies[['ds', 'real', 'group']]], ignore_index=True)

    # Return the DataFrame with all anomalies
    return all_anomalies
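
# Note: pandas 2.2+ deprecates freq='M' (month-end) in favor of 'ME';
# 'M' is kept above to preserve the original behavior on older pandas.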
tab1, tab2 = st.tabs(["Meta Prophet", "Microsoft TAPEX"])
# Upload widget for the input file
uploaded_file = st.file_uploader("Carregue um arquivo CSV ou XLSX", type=['csv', 'xlsx'])
with tab1:
    if uploaded_file:
        df = load_data(uploaded_file)
        df_clean = preprocess_data(df)
        if df_clean.empty:
            st.warning("Não há dados válidos para processar.")
        else:
            # 'all_anomalies' is initialized to an empty DataFrame at startup,
            # so test for emptiness (not key presence) to avoid re-running Prophet
            if st.session_state['all_anomalies'].empty:
                with st.spinner('Aplicando modelo de série temporal...'):
                    all_anomalies = apply_prophet(df_clean)
                    st.session_state['all_anomalies'] = all_anomalies
with tab2:
    # Only allow questions once the anomaly table exists in session state
    if 'all_anomalies' in st.session_state and not st.session_state['all_anomalies'].empty:
        # Input for the user's question
        user_question = st.text_input("Escreva sua questão aqui:", "")
        if user_question:
            bot_response = response(user_question, st.session_state['all_anomalies'])
            st.session_state['history'].append(('👤', user_question))
            st.session_state['history'].append(('🤖', bot_response))

        # Show the conversation history
        for sender, message in st.session_state['history']:
            if sender == '👤':
                st.markdown(f"**👤 {message}**")
            elif sender == '🤖':
                st.markdown(f"**🤖 {message}**", unsafe_allow_html=True)

        # Button to clear the history
        if st.button("Limpar histórico"):
            st.session_state['history'] = []
    else:
        st.warning("Por favor, processe os dados no Meta Prophet primeiro.")