Cosine similarity calculation now working properly

This commit is contained in:
Mann Patel
2025-04-03 11:59:25 -06:00
parent e7580c36f5
commit 99f12319d5
5 changed files with 152 additions and 887 deletions

View File

@@ -1,272 +0,0 @@
from contextlib import closing

import mysql.connector as db_con
import numpy as np
import pandas as pd
from flask import Flask, request, jsonify
from flask_cors import CORS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Flask app initialization
app = Flask(__name__)
# CORS is enabled for every route ("/*") and every origin ("*"), with
# credentialed requests allowed.
# NOTE(review): a wildcard origin combined with supports_credentials=True is
# very permissive — confirm this is only intended for local development.
CORS(app, resources={r"/*": {"origins": "*"}}, supports_credentials=True)
# Database connection setup
def get_db_connection():
    """Open and return a new connection to the local ``Marketplace`` database.

    The caller owns the returned connection and is responsible for closing it.
    """
    settings = {
        "host": "localhost",
        "port": 3306,
        "user": "root",
        "database": "Marketplace",
    }
    return db_con.connect(**settings)
# Fetch all products with category names
def get_all_products():
    """Return every product together with the name of its category.

    Returns:
        list[dict]: One dict per row with the keys ``ProductID``, ``Name``,
        ``Description`` and ``Category``. When anything goes wrong the error
        is printed and an empty list is returned instead of raising.
    """
    query = """
        SELECT p.ProductID, p.Name, p.Description, c.Name AS Category
        FROM Product p
        JOIN Category c ON p.CategoryID = c.CategoryID
    """
    try:
        # closing() releases the cursor and the connection even when
        # execute()/fetchall() raises; previously both leaked on error.
        with closing(get_db_connection()) as connection, \
                closing(connection.cursor(dictionary=True)) as cursor:
            cursor.execute(query)
            return cursor.fetchall()
    except Exception as e:
        # Deliberate best-effort: callers treat an empty list as "no data".
        print(f"Database error getting products: {e}")
        return []
# Fetch user history
def get_user_history(user_id):
    """Return the products recorded in the History table for ``user_id``.

    Args:
        user_id: Value bound to ``History.UserID`` (passed as a query
            parameter, never interpolated into the SQL text).

    Returns:
        list[dict]: One dict per history row with the keys ``ProductID``,
        ``Name``, ``Description`` and ``Category``. When anything goes wrong
        the error is printed and an empty list is returned.
    """
    query = """
        SELECT p.ProductID, p.Name, p.Description, c.Name AS Category
        FROM History h
        JOIN Product p ON h.ProductID = p.ProductID
        JOIN Category c ON p.CategoryID = c.CategoryID
        WHERE h.UserID = %s
    """
    try:
        # closing() releases the cursor and the connection even when the
        # query raises; previously both leaked on error.
        with closing(get_db_connection()) as connection, \
                closing(connection.cursor(dictionary=True)) as cursor:
            cursor.execute(query, (user_id,))
            return cursor.fetchall()
    except Exception as e:
        # Deliberate best-effort: callers treat an empty list as "no history".
        print(f"Error getting user history: {e}")
        return []
# Store recommendations
def store_user_recommendations(user_id, recommendations):
    """Replace the stored recommendations of ``user_id`` with a new set.

    The existing rows are deleted and the new ones inserted before a single
    ``commit()``, so nothing is committed unless every statement succeeded.

    Args:
        user_id: Value written to ``Recommendation.UserID``.
        recommendations: Iterable of product IDs to store.

    Returns:
        bool: ``True`` when the new set was committed, ``False`` when any
        step failed (the error is printed).
    """
    delete_query = "DELETE FROM Recommendation WHERE UserID = %s"
    insert_query = "INSERT INTO Recommendation (UserID, RecommendedProductID) VALUES (%s, %s)"
    try:
        rows = [(user_id, product_id) for product_id in recommendations]
        # closing() releases the cursor and the connection on every path;
        # previously both leaked whenever a statement raised.
        with closing(get_db_connection()) as connection, \
                closing(connection.cursor()) as cursor:
            # First delete existing recommendations
            cursor.execute(delete_query, (user_id,))
            # Then insert the new ones in one batch
            if rows:
                cursor.executemany(insert_query, rows)
            connection.commit()
        return True
    except Exception as e:
        print(f"Error storing recommendations: {e}")
        return False
# Fetch stored recommendations
def get_stored_recommendations(user_id):
    """Return the products currently stored as recommendations for ``user_id``.

    Args:
        user_id: Value bound to ``Recommendation.UserID``.

    Returns:
        list[dict]: One dict per recommended product with the keys
        ``ProductID``, ``Name``, ``Description`` and ``Category``. When
        anything goes wrong the error is printed and an empty list is returned.
    """
    query = """
        SELECT p.ProductID, p.Name, p.Description, c.Name AS Category
        FROM Recommendation r
        JOIN Product p ON r.RecommendedProductID = p.ProductID
        JOIN Category c ON p.CategoryID = c.CategoryID
        WHERE r.UserID = %s
    """
    try:
        # closing() releases the cursor and the connection even when the
        # query raises; previously both leaked on error.
        with closing(get_db_connection()) as connection, \
                closing(connection.cursor(dictionary=True)) as cursor:
            cursor.execute(query, (user_id,))
            return cursor.fetchall()
    except Exception as e:
        print(f"Error getting stored recommendations: {e}")
        return []
# Initialize Recommender class
class Recommender:
    """Content-based product recommender backed by TF-IDF text vectors.

    Each product is described by the concatenation of its category, name and
    description. A user is represented by the mean TF-IDF vector of the
    products in their history, and candidate products are ranked by cosine
    similarity to that vector.
    """

    def __init__(self):
        # DataFrame of loaded products, including the derived 'content' column.
        self.products_df = None
        # TF-IDF matrix with one row per row of ``products_df``.
        self.tfidf_matrix = None
        # Fitted vectorizer that produced ``tfidf_matrix``.
        self.tfidf_vectorizer = None
        # Series mapping ProductID -> row label in ``products_df``.
        self.product_indices = None

    def load_products(self, products_data):
        """Build the product table and its TF-IDF matrix.

        Args:
            products_data: Rows (e.g. a list of dicts) providing the columns
                ``ProductID``, ``Name``, ``Description`` and ``Category``.
        """
        self.products_df = pd.DataFrame(products_data)
        # Combine relevant features for content-based filtering. Every column
        # is NaN-filled: a missing value in any of them would otherwise turn
        # the whole concatenation into NaN, which the vectorizer cannot handle.
        self.products_df['content'] = (
            self.products_df['Category'].fillna('') + ' ' +
            self.products_df['Name'].fillna('') + ' ' +
            self.products_df['Description'].fillna('')
        )
        # Create TF-IDF matrix
        self.tfidf_vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2)
        )
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.products_df['content'])
        # Map product IDs to row labels for quick lookup. Duplicated IDs keep
        # their first row so that a lookup always yields a single label.
        lookup = pd.Series(
            self.products_df.index,
            index=self.products_df['ProductID']
        )
        self.product_indices = lookup[~lookup.index.duplicated(keep='first')]

    def recommend_products_for_user(self, user_id, top_n=40):
        """Recommend products similar to the ones in the user's history.

        Args:
            user_id: User whose history is fetched via ``get_user_history``.
            top_n: Maximum number of product IDs to return.

        Returns:
            list: Product IDs ordered from most to least similar, excluding
            products already in the history. Falls back to
            ``recommend_popular_products`` when the user has no usable
            history, and returns ``[]`` when no products are loaded.
        """
        if self.products_df is None or self.tfidf_matrix is None:
            return []
        user_history = get_user_history(user_id)
        # If no history, fall back to the generic suggestions
        if not user_history:
            return self.recommend_popular_products(top_n)
        # Row labels of the history products that exist in the loaded catalog.
        # Repeated history entries stay repeated, so they weigh more in the mean.
        history_indices = [
            self.product_indices[row['ProductID']]
            for row in user_history
            if row['ProductID'] in self.product_indices
        ]
        if not history_indices:
            return self.recommend_popular_products(top_n)
        # The mean of a scipy sparse matrix is an np.matrix, which recent
        # scikit-learn releases reject; convert it to a plain ndarray first.
        mean_vector = self.tfidf_matrix[history_indices].mean(axis=0)
        user_profile = np.asarray(mean_vector).reshape(1, -1)
        # Calculate similarity scores against every product
        similarity_scores = cosine_similarity(user_profile, self.tfidf_matrix).flatten()
        product_scores = pd.Series(similarity_scores, index=self.products_df.index)
        # Remove products the user has already interacted with
        product_scores = product_scores.drop(index=sorted(set(history_indices)))
        # Sort by similarity score (highest first) and keep the top N labels
        top_labels = product_scores.sort_values(ascending=False).iloc[:top_n].index
        # These are index labels, so select with .loc rather than .iloc
        return self.products_df.loc[top_labels, 'ProductID'].tolist()

    def recommend_popular_products(self, n=40):
        """Fallback used when a user has no usable history.

        There is no popularity metric yet, so this returns up to ``n``
        randomly sampled product IDs, or ``[]`` when no products are loaded.
        """
        if self.products_df is None or self.products_df.empty:
            return []
        sample_size = max(0, min(n, len(self.products_df)))
        return self.products_df.sample(sample_size)['ProductID'].tolist()
# Create recommender instance
# Single module-level instance used by the route handlers and the startup
# code in this file.
recommender = Recommender()
@app.route('/load_products', methods=['GET'])
def load_products():
    """Reload the product catalog from the database into the recommender.

    Responds with the number of products loaded, or with a 500 error when the
    database returned no products.
    """
    catalog = get_all_products()
    if catalog:
        recommender.load_products(catalog)
        return jsonify({'message': 'Products loaded successfully', 'count': len(catalog)})
    return jsonify({'error': 'Failed to load products from database'}), 500
@app.route('/recommend/<int:user_id>', methods=['GET'])
def recommend(user_id):
    """Compute, store and return product recommendations for ``user_id``.

    The catalog is loaded on first use if it has not been loaded yet. A 500
    response is returned when no products are available or when the
    recommendations could not be stored.
    """
    # Lazily load the catalog the first time it is needed
    if recommender.products_df is None:
        catalog = get_all_products()
        if not catalog:
            return jsonify({'error': 'No products available'}), 500
        recommender.load_products(catalog)
    # Rank products by cosine similarity to the user's history
    product_ids = recommender.recommend_products_for_user(user_id)
    # Persist the result before answering
    stored = store_user_recommendations(user_id, product_ids)
    if not stored:
        return jsonify({'error': 'Failed to store recommendations'}), 500
    payload = {
        'userId': user_id,
        'recommendations': product_ids,
        'count': len(product_ids)
    }
    return jsonify(payload)
@app.route('/api/user/session', methods=['POST'])
def handle_session_data():
    """Validate posted session data and check that the database is reachable.

    Expects a JSON object with ``userId``, ``email`` and ``isAuthenticated``.

    Returns:
        400 when the body is not a JSON object or a required field is missing,
        500 when the database cannot be reached or an unexpected error occurs,
        200 with a confirmation message otherwise.
    """
    try:
        # silent=True yields None for a missing or malformed JSON body instead
        # of raising, so bad requests get the 400 below rather than a 500.
        data = request.get_json(silent=True)
        print("Received data:", data)  # Debug print
        if not isinstance(data, dict):
            print("Missing required fields")  # Debug print
            return jsonify({'error': 'Invalid data'}), 400
        user_id = data.get('userId')
        email = data.get('email')
        is_authenticated = data.get('isAuthenticated')
        if not user_id or not email or is_authenticated is None:
            print("Missing required fields")  # Debug print
            return jsonify({'error': 'Invalid data'}), 400
        print(f"Processing session data: User ID: {user_id}, Email: {email}, Authenticated: {is_authenticated}")
        # Test database connection first
        try:
            conn = get_db_connection()
            conn.close()
            print("Database connection successful")
        except Exception as db_err:
            print(f"Database connection error: {db_err}")
            return jsonify({'error': f'Database connection error: {str(db_err)}'}), 500
        # TODO: process/persist the session data here.
        # A view must return a response; falling off the end returned None,
        # which Flask rejects as an invalid response.
        return jsonify({'message': 'Session data received', 'userId': user_id}), 200
    except Exception as e:
        import traceback
        print(f"Error in handle_session_data: {e}")
        print(traceback.format_exc())  # Print full stack trace
        return jsonify({'error': f'Server error: {str(e)}'}), 500
if __name__ == '__main__':
    import os

    # Load products on startup
    products = get_all_products()
    if products:
        recommender.load_products(products)
        print(f"Loaded {len(products)} products at startup")
    else:
        print("Warning: No products loaded at startup")
    # The server listens on all interfaces, and Flask's debug mode exposes an
    # interactive debugger that can execute arbitrary code. Debug mode is
    # therefore opt-in via FLASK_DEBUG instead of being always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)

View File

@@ -1,5 +0,0 @@
-- All Product rows whose ProductID appears in History for UserID 1.
SELECT p.*
FROM Product AS p
WHERE p.ProductID IN (
    SELECT h.ProductID
    FROM History AS h
    WHERE h.UserID = 1
);

View File

@@ -1,221 +1,113 @@
# pip install mysql-connector-python
import pandas as pd
import numpy as np
import mysql.connector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import logging
'''
Recommender system using content-based filtering
'''
class Recommender:
def __init__(self):
    """Create an empty recommender; no product data is loaded yet."""
    # Initialize data structures
    self.products_df = None  # product table (a pandas DataFrame once assigned)
    self.user_profiles = {}  # per-user state keyed by user_id
    self.tfidf_matrix = None  # TF-IDF matrix of the product text
    self.tfidf_vectorizer = None  # vectorizer used to build tfidf_matrix
    self.product_indices = None  # lookup from product id to row index
def load_products(self, products_data):
def database():
    """Open and return a new connection to the local ``Marketplace`` database."""
    return mysql.connector.connect(
        host="localhost",
        port="3306",
        user="root",
        database="Marketplace",
    )
def get_all_products():
db_con = database()
cursor = db_con.cursor()
cursor.execute("SELECT CategoryID FROM Category")
categories = cursor.fetchall()
select_clause = "SELECT p.ProductID"
for category in categories:
category_id = category[0]
select_clause += f", MAX(CASE WHEN pc.CategoryID = {category_id} THEN 1 ELSE 0 END) AS `Cat_{category_id}`"
final_query = f"""
{select_clause}
FROM Product p
LEFT JOIN Product_Category pc ON p.ProductID = pc.ProductID
LEFT JOIN Category c ON pc.CategoryID = c.CategoryID
GROUP BY p.ProductID;
"""
Load product data into the recommender system
products_data: list of dictionaries with product info (id, name, description, category, etc.)
cursor.execute(final_query)
results = cursor.fetchall()
final = []
for row in results:
text_list = list(row)
text_list.pop(0)
final.append(text_list)
cursor.close()
db_con.close()
return final
def get_user_history(user_id):
db_con = database()
cursor = db_con.cursor()
cursor.execute("SELECT CategoryID FROM Category")
categories = cursor.fetchall()
select_clause = "SELECT p.ProductID"
for category in categories:
category_id = category[0] # get the uid of the catefory and then append that to the new column
select_clause += f", MAX(CASE WHEN pc.CategoryID = {category_id} THEN 1 ELSE 0 END) AS `Cat_{category_id}`"
final_query = f"""
{select_clause}
FROM Product p
LEFT JOIN Product_Category pc ON p.ProductID = pc.ProductID
LEFT JOIN Category c ON pc.CategoryID = c.CategoryID
where p.ProductID in (select ProductID from History where UserID = {user_id})
GROUP BY p.ProductID;
"""
self.products_df = pd.DataFrame(products_data)
# Create a text representation for each product (combining various features)
self.products_df['content'] = (
self.products_df['category'] + ' ' +
self.products_df['name'] + ' ' +
self.products_df['description']
)
# Initialize TF-IDF vectorizer to convert text to vectors
self.tfidf_vectorizer = TfidfVectorizer(
stop_words='english',
max_features=5000, # Limit features to avoid sparse matrices
ngram_range=(1, 2) # Use both unigrams and bigrams
)
# Compute TF-IDF matrix
self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.products_df['content'])
# Create a mapping from product_id to index
self.product_indices = pd.Series(
self.products_df.index,
index=self.products_df['product_id']
).drop_duplicates()
def track_user_click(self, user_id, product_id):
    """Record one click by ``user_id`` on ``product_id``.

    A profile is created for the user on first use. Clicks on product IDs
    that are not in ``self.product_indices`` are ignored (the profile is
    still created).

    Args:
        user_id: Key into ``self.user_profiles``.
        product_id: ID of the clicked product.
    """
    profile = self.user_profiles.setdefault(user_id, {
        'clicks': {},
        'category_weights': {
            'electronics': 0,
            'school supplies': 0,
            'rental place': 0,
            'furniture': 0
        }
    })
    if product_id not in self.product_indices:
        return
    # Get the clicked product's row and category
    product_idx = self.product_indices[product_id]
    product_category = self.products_df.iloc[product_idx]['category']
    # Update click count
    clicks = profile['clicks']
    clicks[product_id] = clicks.get(product_id, 0) + 1
    # Update category weight. Categories outside the pre-seeded set start at
    # zero instead of raising KeyError.
    category_weights = profile['category_weights']
    category_weights[product_category] = category_weights.get(product_category, 0) + 1
def get_user_profile_vector(self, user_id):
    """Build the profile vector of a user from their click history.

    The result is a weighted average of the TF-IDF rows of the clicked
    products, where each weight grows with the click count and with the
    user's weight for the product's category. Users without any recorded
    click get a zero vector.

    Returns:
        A ``(1, n_features)`` array, ``n_features`` being the column count of
        ``self.tfidf_matrix``.
    """
    feature_count = self.tfidf_matrix.shape[1]
    profile = self.user_profiles.get(user_id)
    if profile is None or not profile['clicks']:
        # No click history: nothing to average
        return np.zeros((1, feature_count))
    rows = []
    raw_weights = []
    for pid, click_count in profile['clicks'].items():
        if pid not in self.product_indices:
            continue
        row_idx = self.product_indices[pid]
        category = self.products_df.iloc[row_idx]['category']
        preference = profile['category_weights'][category]
        # Click count scaled up by the category preference
        raw_weights.append(click_count * (1 + 0.5 * preference))
        rows.append(self.tfidf_matrix[row_idx])
    # Scale the weights so that they sum to one
    shares = np.array(raw_weights) / np.sum(raw_weights)
    # Accumulate the weighted rows
    result = np.zeros((1, feature_count))
    for share, row in zip(shares, rows):
        result += share * row.toarray()
    return result
def recommend_products(self, user_id, n=5, category_filter=None):
"""
Recommend products to a user based on their profile
user_id: ID of the user
n: Number of recommendations to return
category_filter: Optional filter to limit recommendations to a specific category
"""
# Get user profile vector
user_profile = self.get_user_profile_vector(user_id)
# If user has no profile, recommend popular products (not implemented)
if np.sum(user_profile) == 0:
return self._get_popular_products(n, category_filter)
# Calculate similarity scores
sim_scores = cosine_similarity(user_profile, self.tfidf_matrix)
sim_scores = sim_scores.flatten()
# Create a DataFrame for easier filtering
recommendations_df = pd.DataFrame({
'product_id': self.products_df['product_id'],
'score': sim_scores,
'category': self.products_df['category']
})
# Filter out products that the user has already clicked on
if user_id in self.user_profiles and self.user_profiles[user_id]['clicks']:
clicked_products = list(self.user_profiles[user_id]['clicks'].keys())
recommendations_df = recommendations_df[~recommendations_df['product_id'].isin(clicked_products)]
# Apply category filter if provided
if category_filter:
recommendations_df = recommendations_df[recommendations_df['category'] == category_filter]
# Sort by similarity score and get top n recommendations
recommendations_df = recommendations_df.sort_values('score', ascending=False).head(n)
cursor.execute(final_query)
results = cursor.fetchall()
final = []
for row in results:
text_list = list(row)
text_list.pop(0)
final.append(text_list)
cursor.close()
db_con.close()
return final
def get_recommendations(user_id, top_n=40):
try:
# Get all products and user history with their category vectors
all_products = get_all_products()
user_history = get_user_history(user_id)
# if not user_history:
# # Cold start: return popular products
# return get_popular_products(top_n)
# Calculate similarity between all products and user history
user_profile = np.mean(user_history, axis=0) # Average user preferences
similarities = cosine_similarity([user_profile], all_products)
# finds the indices of the top N products that have the highest
# cosine similarity with the user's profile and sorted from most similar to least similar.
product_indices = similarities[0].argsort()[-top_n:][::-1]
print("product", product_indices)
# Return recommended product IDs
return recommendations_df['product_id'].tolist()
def _get_popular_products(self, n=5, category_filter=None):
    """Fallback suggestions for users without a profile.

    No popularity metric exists yet: ``n`` random products are returned when
    at least ``n`` are available, otherwise every available product is
    returned in table order.

    Args:
        n: Number of product IDs wanted.
        category_filter: When truthy, only products whose ``category`` column
            equals this value are considered.
    """
    pool = self.products_df
    if category_filter:
        pool = pool[pool['category'] == category_filter]
    # Random pick stands in for real popularity data
    chosen = pool.sample(n) if len(pool) >= n else pool
    return chosen['product_id'].tolist()
def recommend_by_category_preference(self, user_id, n=5):
    """Recommend products, favouring the categories the user clicks most.

    Slots are handed out category by category, most-clicked first, in
    proportion to each category's share of the clicks (at least one slot per
    clicked category while slots remain). Leftover slots are filled with
    unfiltered recommendations that are not already in the list. Users
    without a profile, or without any category click, get the popular
    fallback.
    """
    if user_id not in self.user_profiles:
        return self._get_popular_products(n)
    category_weights = self.user_profiles[user_id]['category_weights']
    total_clicks = sum(category_weights.values())
    # Nothing clicked yet: use the fallback
    if total_clicks == 0:
        return self._get_popular_products(n)
    # Most-clicked categories first
    ranked = sorted(category_weights.items(), key=lambda item: item[1], reverse=True)
    picks = []
    slots_left = n
    for category, clicks in ranked:
        if clicks <= 0:
            continue
        # Proportional share of the remaining slots, at least one, never
        # more than what is left
        share = int(slots_left * (clicks / total_clicks))
        quota = min(max(1, share), slots_left)
        category_picks = self.recommend_products(user_id, quota, category)
        picks.extend(category_picks)
        slots_left -= len(category_picks)
        if slots_left <= 0:
            break
    # Top up with general recommendations, skipping duplicates
    if slots_left > 0:
        extras = [
            candidate
            for candidate in self.recommend_products(user_id, slots_left)
            if candidate not in picks
        ]
        picks.extend(extras[:slots_left])
    return picks
return [all_products[i][0] for i in product_indices] # Product IDs
except Exception as e:
logging.error(f"Recommendation error for user {user_id}: {str(e)}")
# return get_popular_products(top_n) # Fallback to popular products
exported = Recommender()
get_recommendations(1)

View File

@@ -1,33 +0,0 @@
import mysql.connector as db_con
#TODO: Specify all the required queries
# Query that selects every column of every row in the Product table.
query_get_all_Prod= ("SELECT * FROM Product ")
#TODO: connect with the db
def database():
    """Connect to the local ``Marketplace`` database and print the first Product row.

    Runs ``query_get_all_Prod`` and prints the first row of the result; nothing
    is printed when the table is empty. Connection or query errors propagate
    to the caller, but the cursor and the connection are always closed.
    """
    # Imported here so the function carries its own dependency.
    from contextlib import closing

    connection = db_con.connect(
        host="localhost",
        port=3306,
        user="root",
        database="Marketplace"
    )
    with closing(connection) as db, closing(db.cursor()) as cursor:
        cursor.execute(query_get_all_Prod)
        rows = cursor.fetchall()
    # Guard against an empty table: indexing the first row unconditionally
    # raised IndexError when Product had no rows.
    if rows:
        print(rows[0])
#TODO: Get All products
# Make it into a dictionary with product id and the list of categories it has
# {Prod1:[1,0,0,0,1]} this could mean it is [electronics, 0, 0, 0, kitchen]
# NOTE: this call runs at import time, so importing this module opens a
# database connection.
database()