fschwartzer committed on
Commit
96bff79
1 Parent(s): 604f418

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -33
app.py CHANGED
@@ -1,11 +1,21 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
 
4
  import datetime
5
  import sentencepiece as spm
6
 
 
 
 
 
 
 
 
 
 
7
  # File upload interface
8
- uploaded_file = st.file_uploader("Carregue um arquivo CSV ou XLSX", type=['csv', 'xlsx'])
9
 
10
  if uploaded_file:
11
  # Load the file into a DataFrame
@@ -14,15 +24,70 @@ if uploaded_file:
14
  elif uploaded_file.name.endswith('.xlsx'):
15
  df = pd.read_excel(uploaded_file)
16
 
17
- df.rename(columns={"ds": "datetime", "real": "monetary value", "Explicação": "explanation"}, inplace=True)
18
- df.sort_values(by=['datetime', 'monetary value'], ascending=False, inplace=True)
19
- df = df[df['monetary value'] >= 10000000.]
20
- df['monetary value'] = df['monetary value'].apply(lambda x: f"{x:.2f}")
21
- df = df.fillna('').astype(str)
22
- table_data = df
23
-
24
- # Display the uploaded table
25
- st.dataframe(table_data.head())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  # Load translation models
28
  pt_en_translator = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5")
@@ -40,59 +105,39 @@ if uploaded_file:
40
  return translated_text
41
 
42
  def response(user_question, table_data):
43
- # Traduz a pergunta para o inglês
44
  question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")
45
- print(question_en)
46
-
47
- # Gera a resposta em inglês
48
  encoding = tapex_tokenizer(table=table_data, query=[question_en], padding=True, return_tensors="pt", truncation=True)
49
  outputs = tapex_model.generate(**encoding)
50
  response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
51
- print(response_en)
52
-
53
- # Traduz a resposta para o português
54
  response_pt = translate(response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
55
  return response_pt
56
 
57
  # Streamlit interface
58
 
59
- st.markdown("""
60
- <div style='display: flex; align-items: center;'>
61
- <div style='width: 20px; height: 20px; background-color: green; border-radius: 50%; margin-right: 2px;'></div>
62
- <div style='width: 20px; height: 20px; background-color: red; border-radius: 50%; margin-right: 2px;'></div>
63
- <div style='width: 20px; height: 20px; background-color: yellow; border-radius: 50%; margin-right: 10px;'></div>
64
- <span style='font-size: 40px; font-weight: bold;'>Chatbot do Tesouro RS</span>
65
- </div>
66
- """, unsafe_allow_html=True)
67
 
68
  # Chat history
69
  if 'history' not in st.session_state:
70
  st.session_state['history'] = []
71
 
72
- # Input box for user question
73
  user_question = st.text_input("Escreva sua questão aqui:", "")
74
 
75
  if user_question:
76
- # Add human emoji when user asks a question
77
  st.session_state['history'].append(('👤', user_question))
78
  st.markdown(f"**👤 {user_question}**")
79
 
80
- # Generate the response
81
- bot_response = response(user_question, table_data)
82
 
83
- # Add robot emoji when generating response and align to the right
84
  st.session_state['history'].append(('🤖', bot_response))
85
  st.markdown(f"<div style='text-align: right'>**🤖 {bot_response}**</div>", unsafe_allow_html=True)
86
 
87
- # Clear history button
88
  if st.button("Limpar"):
89
  st.session_state['history'] = []
90
 
91
- # Display chat history
92
  for sender, message in st.session_state['history']:
93
  if sender == '👤':
94
  st.markdown(f"**👤 {message}**")
95
  elif sender == '🤖':
96
  st.markdown(f"<div style='text-align: right'>**🤖 {message}**</div>", unsafe_allow_html=True)
97
  else:
98
- st.warning("Carregue um arquivo CSV ou XLSX para começar.")
 
1
  import streamlit as st
2
  import pandas as pd
3
  from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
4
+ from prophet import Prophet
5
  import datetime
6
  import sentencepiece as spm
7
 
8
+ st.markdown("""
9
+ <div style='display: flex; align-items: center;'>
10
+ <div style='width: 20px; height: 20px; background-color: green; border-radius: 50%; margin-right: 2px;'></div>
11
+ <div style='width: 20px; height: 20px; background-color: red; border-radius: 50%; margin-right: 2px;'></div>
12
+ <div style='width: 20px; height: 20px; background-color: yellow; border-radius: 50%; margin-right: 10px;'></div>
13
+ <span style='font-size: 40px; font-weight: bold;'>Chatbot do Tesouro RS</span>
14
+ </div>
15
+ """, unsafe_allow_html=True)
16
+
17
  # File upload interface
18
+ uploaded_file = st.file_uploader("Upload a CSV or XLSX file", type=['csv', 'xlsx'])
19
 
20
  if uploaded_file:
21
  # Load the file into a DataFrame
 
24
  elif uploaded_file.name.endswith('.xlsx'):
25
  df = pd.read_excel(uploaded_file)
26
 
27
+ # Data preprocessing for Prophet
28
+ new_df = df.iloc[2:, 9:-1].fillna(0)
29
+ new_df.columns = df.iloc[1, 9:-1]
30
+ new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)
31
+
32
# Portuguese three-letter month abbreviations mapped to zero-padded month numbers.
month_dict = {
    'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
    'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
    'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12',
}


def convert_column_name(column_name):
    """Convert a 'Mon/YYYY'-style header into an 'MM/YYYY' string.

    Args:
        column_name: A column header. Expected to look like 'Jan/2023'
            (Portuguese month abbreviation, a slash, then the year).

    Returns:
        'MM/YYYY' for headers containing a slash, where MM comes from
        ``month_dict`` ('00' when the abbreviation is unknown) and YYYY keeps
        only the digits found after the slash. The label column
        'Rótulos de Linha', non-string headers and headers without a slash
        are returned unchanged.
    """
    if column_name == 'Rótulos de Linha':
        return column_name

    # Headers such as NaN or a totals column carry no month/year pair.
    # Returning them unchanged avoids an AttributeError/IndexError here and
    # leaves the decision about them to the caller.
    if not isinstance(column_name, str) or '/' not in column_name:
        return column_name

    parts = column_name.split('/')
    month = parts[0].strip()
    # Drop any non-digit residue around the year (spaces, footnote marks).
    year = ''.join(ch for ch in parts[1].strip() if ch.isdigit())
    month_number = month_dict.get(month, '00')
    return f"{month_number}/{year}"
47
+
48
+ new_df.columns = [convert_column_name(col) for col in new_df.columns]
49
+ new_df.columns = pd.to_datetime(new_df.columns, errors='coerce')
50
+ new_df.rename(columns={new_df.columns[0]: 'Rotulo'}, inplace=True)
51
+ df_clean = new_df.copy()
52
+
53
# Collect the per-group anomaly frames in a list and concatenate once at the
# end; concatenating inside the loop copies the accumulated frame every time.
anomaly_frames = []

# The Timestamp-labelled columns hold the time series. The mask does not
# depend on the row, so it is computed once instead of per iteration.
is_date_col = [isinstance(col, pd.Timestamp) for col in df_clean.columns]
date_cols = [col for col, keep in zip(df_clean.columns, is_date_col) if keep]

# Fit one Prophet model per row (group) and keep the observations that fall
# outside the model's 95% uncertainty interval.
for _, row in df_clean.iterrows():
    data = pd.DataFrame({'ds': date_cols, 'y': row[is_date_col].values})

    # Only strictly positive observations are used for fitting.
    data = data[data['y'] > 0].reset_index(drop=True)
    if len(data) < 2:
        print(f"Skipping group {row['Rotulo']} because there are less than 2 non-zero observations.")
        continue

    try:
        model = Prophet(interval_width=0.95)
        model.fit(data)
    except ValueError as e:
        print(f"Skipping group {row['Rotulo']} due to error: {e}")
        continue

    future = model.make_future_dataframe(periods=12, freq='M')
    forecast = model.predict(future)

    # Pad the observed values with None so they line up with the forecast
    # rows; the padded rows never satisfy the comparisons below.
    real_values = list(data['y']) + [None] * (len(forecast) - len(data))
    forecast['real'] = real_values

    out_of_band = (
        (forecast['real'] < forecast['yhat_lower'])
        | (forecast['real'] > forecast['yhat_upper'])
    )
    # .assign returns a new frame, so the group label is never written into
    # a filtered view of `forecast`.
    anomalies = forecast.loc[out_of_band, ['ds', 'real']].assign(Group=row['Rotulo'])
    if not anomalies.empty:
        anomaly_frames.append(anomalies)

# When nothing was flagged, still build a frame with the expected columns so
# the renaming and formatting below do not fail on a missing column.
if anomaly_frames:
    all_anomalies = pd.concat(anomaly_frames, ignore_index=True)
else:
    all_anomalies = pd.DataFrame(columns=['ds', 'real', 'Group'])

# Preparing anomalies DataFrame for TAPEX model
all_anomalies.rename(columns={"ds": "datetime", "real": "monetary value", "Group": "explanation"}, inplace=True)
all_anomalies['monetary value'] = all_anomalies['monetary value'].apply(lambda x: f"{x:.2f}")
all_anomalies = all_anomalies.fillna('').astype(str)
91
 
92
  # Load translation models
93
  pt_en_translator = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5")
 
105
  return translated_text
106
 
107
  def response(user_question, table_data):
 
108
  question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")
 
 
 
109
  encoding = tapex_tokenizer(table=table_data, query=[question_en], padding=True, return_tensors="pt", truncation=True)
110
  outputs = tapex_model.generate(**encoding)
111
  response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
 
 
 
112
  response_pt = translate(response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
113
  return response_pt
114
 
115
  # Streamlit interface
116
 
117
+ st.dataframe(all_anomalies.head())
 
 
 
 
 
 
 
118
 
119
  # Chat history
120
  if 'history' not in st.session_state:
121
  st.session_state['history'] = []
122
 
 
123
  user_question = st.text_input("Escreva sua questão aqui:", "")
124
 
125
  if user_question:
 
126
  st.session_state['history'].append(('👤', user_question))
127
  st.markdown(f"**👤 {user_question}**")
128
 
129
+ bot_response = response(user_question, all_anomalies)
 
130
 
 
131
  st.session_state['history'].append(('🤖', bot_response))
132
  st.markdown(f"<div style='text-align: right'>**🤖 {bot_response}**</div>", unsafe_allow_html=True)
133
 
 
134
  if st.button("Limpar"):
135
  st.session_state['history'] = []
136
 
 
137
  for sender, message in st.session_state['history']:
138
  if sender == '👤':
139
  st.markdown(f"**👤 {message}**")
140
  elif sender == '🤖':
141
  st.markdown(f"<div style='text-align: right'>**🤖 {message}**</div>", unsafe_allow_html=True)
142
  else:
143
+ st.warning("Please upload a CSV or XLSX file to start.")