File size: 7,362 Bytes
ee275ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
from io import BytesIO
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import chardet
import os

# Load model and tokenizer
SENTIMENT_MODEL_NAME = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"


@st.cache_resource
def _load_sentiment_model(model_name=SENTIMENT_MODEL_NAME):
    """Load the tokenizer and sequence-classification model for ``model_name``.

    Streamlit re-executes this script from the top on every widget
    interaction; wrapping the load in ``st.cache_resource`` keeps a single
    shared copy instead of re-instantiating both objects on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSequenceClassification.from_pretrained(model_name),
    )


tokenizer, model = _load_sentiment_model()

# Function to analyze sentiment
def analyze_sentiment(text):
    """Classify ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to score; it is tokenized with truncation and padding
            enabled.

    Returns:
        A dict with two keys: ``"label"`` (``'NEGATIVE'`` or ``'POSITIVE'``,
        chosen by the index of the highest softmax probability) and
        ``"score"`` (that probability as a Python float).
    """
    class_names = ('NEGATIVE', 'POSITIVE')

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        logits = model(**encoded).logits

    probabilities = logits.softmax(dim=1)
    best = torch.max(probabilities, dim=1)
    return {
        "label": class_names[best.indices.item()],
        "score": best.values.item(),
    }

# Function to detect file encoding
def detect_encoding(file):
    """Return the encoding ``chardet`` reports for the full contents of ``file``.

    Args:
        file: A binary file-like object exposing ``read()``. When it reports
            itself as seekable it is rewound before reading, so the whole
            content is analysed even if it was read earlier, and rewound
            again afterwards so later readers start from the beginning.

    Returns:
        The value stored under ``'encoding'`` in the ``chardet.detect``
        result.
    """
    # Objects without a ``seekable()`` method are treated as non-seekable and
    # read from their current position, exactly as before.
    can_rewind = getattr(file, "seekable", lambda: False)()

    if can_rewind:
        file.seek(0)
    raw_bytes = file.read()
    if can_rewind:
        file.seek(0)

    return chardet.detect(raw_bytes)['encoding']

def generate_pdf(pie_chart_path, pos_wordcloud_path, neg_wordcloud_path):
    """Compose the one-page sentiment report and return it as an in-memory PDF.

    The page is 8.27 x 16.5 inches and is laid out top-down: the report
    title, the pie chart, then a "Positive Keywords" and a "Negative
    Keywords" section, each a heading followed by its word-cloud image.

    Args:
        pie_chart_path: Path of the pie chart image to embed.
        pos_wordcloud_path: Path of the positive-review word cloud image.
        neg_wordcloud_path: Path of the negative-review word cloud image.

    Returns:
        A ``BytesIO`` holding the finished PDF, positioned at offset 0.
    """
    page_width = 8.27 * inch  # A4 width
    page_height = 16.5 * inch  # Custom height that fits all three images
    buffer = BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=(page_width, page_height))

    # ``cursor_y`` tracks the top of the next element, moving down the page.
    cursor_y = page_height - 1 * inch

    # Title
    pdf.setFont("Helvetica-Bold", 20)
    pdf.drawString(2.2 * inch, cursor_y, "Sentiment Analysis Report")
    cursor_y -= 2 * inch

    # Pie chart: 5 inches wide by 4 inches tall, anchored at its bottom-left
    chart_width = 5 * inch
    chart_height = 4 * inch
    pdf.drawImage(
        pie_chart_path,
        1.5 * inch,
        cursor_y - chart_height,
        width=chart_width,
        height=chart_height,
    )
    cursor_y -= (chart_height + 1 * inch)  # Chart plus spacing

    # Keyword sections: heading, then a 6 x 3 inch (2:1) word cloud below it
    sections = (
        ("Positive Keywords", pos_wordcloud_path),
        ("Negative Keywords", neg_wordcloud_path),
    )
    for heading, cloud_path in sections:
        pdf.setFont("Helvetica-Bold", 12)
        pdf.drawString(3 * inch, cursor_y, heading)
        pdf.drawImage(
            cloud_path,
            1 * inch,
            cursor_y - 3.3 * inch,
            width=6 * inch,
            height=3 * inch,
        )
        cursor_y -= (3 * inch + 1 * inch)  # Cloud plus spacing

    pdf.save()
    buffer.seek(0)

    return buffer


# Streamlit UI

# Fixed on-disk locations of the intermediate images embedded in the PDF.
PIE_CHART_PATH = "sentiment_pie_chart.png"
POS_WORDCLOUD_PATH = "pos_wordcloud.png"
NEG_WORDCLOUD_PATH = "neg_wordcloud.png"

# Reviews longer than this many characters are skipped.
MAX_REVIEW_CHARS = 512


def _save_wordcloud(text, path, stopwords, empty_message):
    """Write a word cloud of ``text`` to ``path``, or a placeholder image.

    ``WordCloud.generate`` raises ``ValueError`` when no drawable word is
    left (for example when no review received this sentiment). In that case
    a small image showing ``empty_message`` is saved instead, so the PDF
    builder always finds a file at ``path``.
    """
    try:
        cloud = WordCloud(
            max_font_size=80,
            max_words=10,
            background_color='white',
            stopwords=stopwords,
        ).generate(text)
    except ValueError:
        fig, ax = plt.subplots(figsize=(4, 2))  # Same 2:1 ratio as the clouds
        ax.text(0.5, 0.5, empty_message, ha='center', va='center')
        ax.axis('off')
        fig.savefig(path)
        plt.close(fig)
    else:
        cloud.to_file(path)


def _build_report(reviews):
    """Score ``reviews``, render the charts to disk and return the PDF bytes.

    Args:
        reviews: A non-empty pandas Series of review strings.

    Returns:
        The finished PDF report as ``bytes``.
    """
    sentiments = reviews.apply(analyze_sentiment)
    df = pd.DataFrame({
        'review': reviews,
        'sentiment_label': sentiments.apply(lambda result: result['label']),
        'sentiment_score': sentiments.apply(lambda result: result['score']),
    })

    # Pie chart of the label distribution
    sentiment_counts = df['sentiment_label'].value_counts()
    fig, ax = plt.subplots()
    ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=45)
    ax.set_title('Distribution of Sentiment')
    fig.savefig(PIE_CHART_PATH)
    plt.close(fig)  # Figures otherwise accumulate across Streamlit reruns

    # One word cloud per sentiment label
    stopwords = set(STOPWORDS)
    clouds = (
        ('POSITIVE', POS_WORDCLOUD_PATH, "No positive keywords"),
        ('NEGATIVE', NEG_WORDCLOUD_PATH, "No negative keywords"),
    )
    for label, path, empty_message in clouds:
        text = df.loc[df['sentiment_label'] == label, 'review'].str.cat(sep=' ')
        _save_wordcloud(text, path, stopwords, empty_message)

    return generate_pdf(PIE_CHART_PATH, POS_WORDCLOUD_PATH, NEG_WORDCLOUD_PATH).getvalue()


st.title("Sentiment Analysis and Reporting")

# Session state keeps the finished report available across reruns, so the
# download button does not disappear after it has been clicked.
if 'show_pdf_download' not in st.session_state:
    st.session_state.show_pdf_download = False
if 'pdf_report' not in st.session_state:
    st.session_state.pdf_report = None

# Sidebar for encoding detection and reset button
st.sidebar.header("File Encoding Checker")

# File uploader in the sidebar
encoding_check_file = st.sidebar.file_uploader("Upload CSV file for Encoding Check", type=["csv"])

if encoding_check_file is not None:
    # Detect the encoding
    encoding = detect_encoding(encoding_check_file)
    st.sidebar.write(f"Detected encoding: {encoding}")

# Reset button in the sidebar
if st.sidebar.button("Reset Analysis"):
    for image_path in (PIE_CHART_PATH, POS_WORDCLOUD_PATH, NEG_WORDCLOUD_PATH):
        try:
            os.remove(image_path)
        except FileNotFoundError:
            # Nothing to delete: the image was never generated or is already gone.
            pass
    st.session_state.show_pdf_download = False
    st.session_state.pdf_report = None
    st.sidebar.write("Files deleted. Please re-upload a file to start over.")

# File uploader for sentiment analysis
uploaded_file = st.file_uploader("Upload CSV file for Sentiment Analysis", type=["csv"])

# Dropdown for encoding specification in the main panel
encodings = ['utf-8', 'latin-1', 'ISO-8859-1', 'ASCII', 'UTF-16', 'UTF-32', 'ANSI', "Windows-1251", 'Windows-1252']
user_encoding = st.selectbox("Select Encoding", options=encodings, index=0)

# Button to start processing
if st.button("Go"):
    # A new run invalidates any previously generated report.
    st.session_state.show_pdf_download = False
    st.session_state.pdf_report = None

    if uploaded_file is None:
        st.info("Please upload a CSV file to get started.")
    else:
        try:
            # Load the CSV file into DataFrame with specified encoding
            uploaded_file.seek(0)  # Reset the file pointer to the beginning
            df = pd.read_csv(uploaded_file, encoding=user_encoding)
        except (UnicodeError, LookupError):
            # LookupError: the selected codec name is not available on this
            # platform; UnicodeError: the bytes do not match the codec.
            st.error("Error decoding the file. Please specify the correct encoding.")
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            st.error(f"Could not parse the CSV file: {exc}")
        else:
            # Check if the DataFrame has exactly one column
            if df.shape[1] != 1:
                st.warning("The CSV file should only contain one column with review data.")
            else:
                # Clean up: drop missing cells (they would otherwise become the
                # text "nan"), trim whitespace, and keep only non-empty
                # reviews within the length limit.
                reviews = df.iloc[:, 0].dropna().astype(str).str.strip()
                reviews = reviews[reviews.str.len().between(1, MAX_REVIEW_CHARS)]

                if reviews.empty:
                    st.warning(
                        f"No reviews between 1 and {MAX_REVIEW_CHARS} characters were found in the file."
                    )
                else:
                    st.session_state.pdf_report = _build_report(reviews)
                    st.session_state.show_pdf_download = True
                    st.write("Processing complete!")

# Rendered outside the "Go" branch so it survives the rerun a click triggers.
if st.session_state.show_pdf_download and st.session_state.pdf_report is not None:
    st.download_button(
        "Download PDF Report",
        st.session_state.pdf_report,
        file_name="sentiment_analysis_report.pdf",
        mime="application/pdf",
    )