"""Streamlit app: classify CSV reviews with DistilBERT and build a PDF report.

The app reads a single-column CSV of reviews, labels each review as
POSITIVE or NEGATIVE, and produces a PDF containing a pie chart of the
label distribution plus one word cloud per sentiment class.
"""

import os
from io import BytesIO
from typing import Optional

import chardet
import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st
import torch
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch
from reportlab.pdfgen import canvas
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from wordcloud import STOPWORDS, WordCloud

MODEL_NAME = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

# Image files written to the working directory and embedded in the PDF.
PIE_CHART_PATH = "sentiment_pie_chart.png"
POS_WORDCLOUD_PATH = "pos_wordcloud.png"
NEG_WORDCLOUD_PATH = "neg_wordcloud.png"
GENERATED_FILES = (PIE_CHART_PATH, POS_WORDCLOUD_PATH, NEG_WORDCLOUD_PATH)

# Reviews longer than this many characters are excluded from the analysis.
MAX_REVIEW_CHARS = 512

# Load model and tokenizer.
# NOTE(review): Streamlit re-executes this script on every interaction, so
# these loads are repeated each time; consider a Streamlit resource cache if
# the installed Streamlit version provides one.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)


def analyze_sentiment(text):
    """Classify *text* as NEGATIVE or POSITIVE.

    Args:
        text: The review text to classify. Input beyond the tokenizer's
            maximum length is truncated.

    Returns:
        A dict with ``"label"`` (``"NEGATIVE"`` or ``"POSITIVE"``) and
        ``"score"`` (the softmax probability of that label).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = outputs.logits.softmax(dim=1)
    labels = ['NEGATIVE', 'POSITIVE']
    score, label = torch.max(scores, dim=1)
    return {"label": labels[label.item()], "score": score.item()}


def detect_encoding(file):
    """Return the encoding chardet detects for *file*.

    Args:
        file: A seekable binary file-like object.

    Returns:
        The detected encoding name, or ``None`` when chardet cannot tell
        (for example when the file is empty).
    """
    # Rewind first: the same upload object may already have been consumed.
    file.seek(0)
    rawdata = file.read()
    # Leave the stream rewound so later readers see the full content.
    file.seek(0)
    return chardet.detect(rawdata)['encoding']


def _draw_keyword_section(c, heading, image_path, y_position):
    """Draw a keyword heading and its word cloud (or a placeholder)."""
    c.setFont("Helvetica-Bold", 12)
    c.drawString(3 * inch, y_position, heading)
    if image_path is None:
        # No word cloud could be produced for this sentiment class.
        c.setFont("Helvetica", 10)
        c.drawString(3 * inch, y_position - 1.5 * inch, "No keywords available.")
        return
    # 6in x 3in keeps the 2:1 ratio of the word cloud image.
    c.drawImage(image_path, 1 * inch, y_position - 3.3 * inch,
                width=6 * inch, height=3 * inch)


def generate_pdf(pie_chart_path, pos_wordcloud_path, neg_wordcloud_path):
    """Build the one-page PDF report.

    Args:
        pie_chart_path: Path of the pie chart image.
        pos_wordcloud_path: Path of the positive word cloud image, or
            ``None`` to draw a placeholder instead.
        neg_wordcloud_path: Path of the negative word cloud image, or
            ``None`` to draw a placeholder instead.

    Returns:
        A ``BytesIO`` holding the PDF, positioned at the start.
    """
    pdf_output = BytesIO()
    pdf_height = 16.5 * inch  # Tall single page that fits all sections
    pdf_width = 8.27 * inch  # A4 width
    c = canvas.Canvas(pdf_output, pagesize=(pdf_width, pdf_height))

    # Start one inch below the top edge and work downwards.
    y_position = pdf_height - 1 * inch

    # Title
    c.setFont("Helvetica-Bold", 20)
    c.drawString(2.2 * inch, y_position, "Sentiment Analysis Report")
    y_position -= 2 * inch

    # Pie chart, drawn 5 inches wide by 4 inches high.
    pie_chart_width = 5 * inch
    pie_chart_height = 4 * inch
    c.drawImage(pie_chart_path, 1.5 * inch, y_position - pie_chart_height,
                width=pie_chart_width, height=pie_chart_height)
    y_position -= pie_chart_height + 1 * inch  # Chart plus spacing

    _draw_keyword_section(c, "Positive Keywords", pos_wordcloud_path, y_position)
    y_position -= 3 * inch + 1 * inch  # Word cloud plus spacing

    _draw_keyword_section(c, "Negative Keywords", neg_wordcloud_path, y_position)

    c.save()
    pdf_output.seek(0)
    return pdf_output


def _remove_generated_files():
    """Delete any chart/word cloud images left by a previous analysis."""
    for path in GENERATED_FILES:
        if os.path.exists(path):
            os.remove(path)


def _save_pie_chart(sentiment_counts, path):
    """Save a pie chart of the sentiment label counts to *path*."""
    fig, ax = plt.subplots()
    try:
        ax.pie(sentiment_counts, labels=sentiment_counts.index,
               autopct='%1.1f%%', startangle=45)
        ax.set_title('Distribution of Sentiment')
        fig.savefig(path)
    finally:
        # Close the figure so reruns do not accumulate open figures.
        plt.close(fig)


def _save_wordcloud(text, path) -> Optional[str]:
    """Save a word cloud of *text* to *path*.

    Returns *path*, or ``None`` when *text* yields no words to plot.
    """
    if not text.strip():
        return None
    cloud = WordCloud(max_font_size=80, max_words=10,
                      background_color='white', stopwords=set(STOPWORDS))
    try:
        cloud.generate(text)
    except ValueError:
        # WordCloud raises ValueError when no words remain to plot
        # (for example when the text consists only of stopwords).
        return None
    cloud.to_file(path)
    return path


def _clean_reviews(df):
    """Return a one-column ``review`` frame of usable review strings.

    Missing values, empty strings, and reviews longer than
    ``MAX_REVIEW_CHARS`` characters are dropped.
    """
    reviews = df.iloc[:, 0].dropna().astype(str).str.strip()
    lengths = reviews.str.len()
    reviews = reviews[(lengths > 0) & (lengths <= MAX_REVIEW_CHARS)]
    # Build a fresh frame so later column assignments never hit a slice.
    return pd.DataFrame({'review': reviews})


def _build_report(df):
    """Run sentiment analysis on ``df['review']`` and return the PDF."""
    results = df['review'].apply(analyze_sentiment)
    df['sentiment_label'] = results.apply(lambda r: r['label'])
    df['sentiment_score'] = results.apply(lambda r: r['score'])

    _save_pie_chart(df['sentiment_label'].value_counts(), PIE_CHART_PATH)

    pos_reviews = df.loc[df['sentiment_label'] == 'POSITIVE', 'review'].str.cat(sep=' ')
    neg_reviews = df.loc[df['sentiment_label'] == 'NEGATIVE', 'review'].str.cat(sep=' ')
    pos_wordcloud_path = _save_wordcloud(pos_reviews, POS_WORDCLOUD_PATH)
    neg_wordcloud_path = _save_wordcloud(neg_reviews, NEG_WORDCLOUD_PATH)

    return generate_pdf(PIE_CHART_PATH, pos_wordcloud_path, neg_wordcloud_path)


# Streamlit UI
st.title("Sentiment Analysis and Reporting")

# Initialize session state for button visibility
if 'show_pdf_download' not in st.session_state:
    st.session_state.show_pdf_download = False

# Sidebar for encoding detection and reset button
st.sidebar.header("File Encoding Checker")

# File uploader in the sidebar
encoding_check_file = st.sidebar.file_uploader(
    "Upload CSV file for Encoding Check", type=["csv"])
if encoding_check_file:
    # Detect the encoding
    encoding = detect_encoding(encoding_check_file)
    st.sidebar.write(f"Detected encoding: {encoding}")

# Reset button in the sidebar
if st.sidebar.button("Reset Analysis"):
    _remove_generated_files()
    st.sidebar.write("Files deleted. Please re-upload a file to start over.")

# File uploader for sentiment analysis
uploaded_file = st.file_uploader("Upload CSV file for Sentiment Analysis", type=["csv"])

# Dropdown for encoding specification in the main panel
encodings = ['utf-8', 'latin-1', 'ISO-8859-1', 'ASCII', 'UTF-16', 'UTF-32',
             'ANSI', "Windows-1251", 'Windows-1252']
user_encoding = st.selectbox("Select Encoding", options=encodings, index=0)

# Button to start processing
if st.button("Go"):
    if uploaded_file:
        try:
            # Load the CSV file into a DataFrame with the chosen encoding
            uploaded_file.seek(0)  # Reset the file pointer to the beginning
            df = pd.read_csv(uploaded_file, encoding=user_encoding)
        except (UnicodeError, LookupError):
            # LookupError covers encoding names this platform does not know.
            st.error("Error decoding the file. Please specify the correct encoding.")
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            st.error("The file could not be parsed as CSV. Please check its contents.")
        else:
            # The analysis expects exactly one column of review text
            if df.shape[1] != 1:
                st.warning("The CSV file should only contain one column with review data.")
            else:
                df = _clean_reviews(df)
                if df.empty:
                    st.warning("No reviews left to analyze after cleaning the data.")
                else:
                    pdf_output = _build_report(df)

                    st.write("Processing complete!")

                    # Update session state to show the appropriate buttons
                    st.session_state.show_pdf_download = True

                    st.download_button("Download PDF Report", pdf_output,
                                       file_name="sentiment_analysis_report.pdf",
                                       mime="application/pdf")
    else:
        st.info("Please upload a CSV file to get started.")