# NextGrocery / jaccard_similarity.py
# Initial commit with full project (f8c0ae2), by student-abdullah.
import pandas as pd
from sklearn.metrics import jaccard_score
import numpy as np
from load_data import items
# Collect every distinct item across all transactions.  Ordering by first
# appearance (dicts preserve insertion order) keeps the matrix column order,
# and therefore tie-breaking between equally similar products, reproducible
# from run to run; iterating a set does not guarantee that.
unique_items_list = list(
    dict.fromkeys(item for sublist in items for item in sublist)
)
unique_items = set(unique_items_list)

# Build the transaction-by-item count matrix: one row per transaction, one
# column per item, each cell holding how often the item occurs in that
# transaction.  Counts are accumulated in a NumPy array and wrapped once,
# instead of writing cell by cell through ``DataFrame.loc``.
_item_position = {item: pos for pos, item in enumerate(unique_items_list)}
_counts = np.zeros((len(items), len(unique_items_list)), dtype=np.int64)
for _row, _transaction in enumerate(items):
    for _item in _transaction:
        _counts[_row, _item_position[_item]] += 1
product_item_matrix = pd.DataFrame(
    _counts, index=range(len(items)), columns=unique_items_list
)

# Convert the counts to a binary presence matrix (1 = item occurs in the
# transaction).  A vectorised comparison avoids the element-wise
# ``DataFrame.map`` call, which only exists in recent pandas releases.
product_item_matrix_binary = (product_item_matrix > 0).astype(np.int64)

# Calculate Jaccard similarity between every pair of items:
#     |transactions containing both| / |transactions containing either|
# One matrix product yields all pairwise intersection sizes at once, which
# replaces a separate ``jaccard_score`` call for every pair of columns.
_presence = product_item_matrix_binary.to_numpy()
_intersection = _presence.T @ _presence
_item_totals = _presence.sum(axis=0)
_union = _item_totals[:, None] + _item_totals[None, :] - _intersection
_scores = np.divide(
    _intersection,
    _union,
    # Pairs with an empty union score 0.0 instead of dividing by zero.
    out=np.zeros(_intersection.shape, dtype=float),
    where=_union > 0,
)
similarity_matrix_jaccard = pd.DataFrame(
    _scores,
    index=product_item_matrix.columns,
    columns=product_item_matrix.columns,
)
# Function to get collaborative recommendations based on a product
def collaborative_recommendations(
    product_name,
    similarity_matrix=similarity_matrix_jaccard,
    threshold=0.01,
    top_n=5,
):
    """Return the products most similar to ``product_name``.

    Args:
        product_name: Label of the product to find recommendations for.
        similarity_matrix: DataFrame of pairwise similarity scores whose
            columns (and index) are product labels.
        threshold: Only products scoring strictly above this value are
            returned.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of up to ``top_n`` product labels ordered from most to least
        similar, or an explanatory message string when ``product_name`` is
        not a column of ``similarity_matrix``.
    """
    # Validate against the matrix actually being queried, so a caller-supplied
    # matrix cannot trigger a KeyError on the lookup below.
    if product_name not in similarity_matrix.columns:
        return f"Product '{product_name}' not found in the database."

    # A product is always perfectly similar to itself; drop it so it does not
    # occupy one of the recommendation slots.
    scores = similarity_matrix[product_name].drop(
        labels=product_name, errors="ignore"
    )
    # A stable sort keeps equally scored products in matrix order.
    ranked = scores[scores > threshold].sort_values(
        ascending=False, kind="stable"
    )
    return ranked.index.tolist()[:top_n]