Cosine Sim Calc now Working properly

2025-04-03 11:59:25 -06:00
parent e7580c36f5
commit 99f12319d5
5 changed files with 152 additions and 887 deletions
--- a/recommondation-engine/example.py
+++ b/recommondation-engine/example.py
@@ -1,272 +0,0 @@
-from flask import Flask, request, jsonify
-from flask_cors import CORS
-import pandas as pd
-import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-import mysql.connector as db_con
-
-# Flask app initialization
-app = Flask(__name__)
-CORS(app, resources={r"/*": {"origins": "*"}}, supports_credentials=True)
-
-# Database connection setup
-def get_db_connection():
-    return db_con.connect(
-        host="localhost",
-        port=3306,
-        user="root",
-        database="Marketplace"
-    )
-
-# Fetch all products with category names
-def get_all_products():
-    query = """
-        SELECT p.ProductID, p.Name, p.Description, c.Name AS Category
-        FROM Product p
-        JOIN Category c ON p.CategoryID = c.CategoryID
-    """
-
-    try:
-        connection = get_db_connection()
-        cursor = connection.cursor(dictionary=True)
-        cursor.execute(query)
-        products = cursor.fetchall()
-        cursor.close()
-        connection.close()
-        return products
-    except Exception as e:
-        print(f"Database error getting products: {e}")
-        return []
-
-# Fetch user history
-def get_user_history(user_id):
-    query = """
-        SELECT p.ProductID, p.Name, p.Description, c.Name AS Category
-        FROM History h
-        JOIN Product p ON h.ProductID = p.ProductID
-        JOIN Category c ON p.CategoryID = c.CategoryID
-        WHERE h.UserID = %s
-    """
-    try:
-        connection = get_db_connection()
-        cursor = connection.cursor(dictionary=True)
-        cursor.execute(query, (user_id,))
-        history = cursor.fetchall()
-        cursor.close()
-        connection.close()
-        return history
-    except Exception as e:
-        print(f"Error getting user history: {e}")
-        return []
-
-# Store recommendations
-def store_user_recommendations(user_id, recommendations):
-    delete_query = "DELETE FROM Recommendation WHERE UserID = %s"
-    insert_query = "INSERT INTO Recommendation (UserID, RecommendedProductID) VALUES (%s, %s)"
-
-    try:
-        connection = get_db_connection()
-        cursor = connection.cursor()
-
-        # First delete existing recommendations
-        cursor.execute(delete_query, (user_id,))
-
-        # Then insert new recommendations
-        for product_id in recommendations:
-            cursor.execute(insert_query, (user_id, product_id))
-
-        connection.commit()
-        cursor.close()
-        connection.close()
-        return True
-    except Exception as e:
-        print(f"Error storing recommendations: {e}")
-        return False
-
-# Fetch stored recommendations
-def get_stored_recommendations(user_id):
-    query = """
-        SELECT p.ProductID, p.Name, p.Description, c.Name AS Category
-        FROM Recommendation r
-        JOIN Product p ON r.RecommendedProductID = p.ProductID
-        JOIN Category c ON p.CategoryID = c.CategoryID
-        WHERE r.UserID = %s
-    """
-    try:
-        connection = get_db_connection()
-        cursor = connection.cursor(dictionary=True)
-        cursor.execute(query, (user_id,))
-        recommendations = cursor.fetchall()
-        cursor.close()
-        connection.close()
-        return recommendations
-    except Exception as e:
-        print(f"Error getting stored recommendations: {e}")
-        return []
-
-# Initialize Recommender class
-class Recommender:
-    def __init__(self):
-        self.products_df = None
-        self.tfidf_matrix = None
-        self.tfidf_vectorizer = None
-        self.product_indices = None
-
-    def load_products(self, products_data):
-        self.products_df = pd.DataFrame(products_data)
-
-        # Combine relevant features for content-based filtering
-        self.products_df['content'] = (
-            self.products_df['Category'] + ' ' +
-            self.products_df['Name'] + ' ' +
-            self.products_df['Description'].fillna('')
-        )
-
-        # Create TF-IDF matrix
-        self.tfidf_vectorizer = TfidfVectorizer(
-            stop_words='english',
-            max_features=5000,
-            ngram_range=(1, 2)
-        )
-        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.products_df['content'])
-
-        # Map product IDs to indices for quick lookup
-        self.product_indices = pd.Series(
-            self.products_df.index,
-            index=self.products_df['ProductID']
-        ).drop_duplicates()
-
-    def recommend_products_for_user(self, user_id, top_n=40):
-        """
-        Generate product recommendations based on user history using cosine similarity
-        """
-        # Get user history
-        user_history = get_user_history(user_id)
-
-        # If no history, return popular products
-        if not user_history:
-            # In a real system, you might return popular products here
-            return self.recommend_popular_products(top_n)
-
-        # Convert user history to DataFrame
-        history_df = pd.DataFrame(user_history)
-
-        # Get indices of products in user history
-        history_indices = []
-        for product_id in history_df['ProductID']:
-            if product_id in self.product_indices:
-                history_indices.append(self.product_indices[product_id])
-
-        if not history_indices:
-            return self.recommend_popular_products(top_n)
-
-        # Get TF-IDF vectors for user's history
-        user_profile = self.tfidf_matrix[history_indices].mean(axis=0).reshape(1, -1)
-
-        # Calculate similarity scores
-        similarity_scores = cosine_similarity(user_profile, self.tfidf_matrix)
-        similarity_scores = similarity_scores.flatten()
-
-        # Create a Series with product indices and similarity scores
-        product_scores = pd.Series(similarity_scores, index=self.products_df.index)
-
-        # Remove products the user has already interacted with
-        product_scores = product_scores.drop(history_indices)
-
-        # Sort by similarity score (highest first)
-        product_scores = product_scores.sort_values(ascending=False)
-
-        # Get top N product indices
-        top_indices = product_scores.iloc[:top_n].index
-
-        # Get product IDs for these indices
-        recommended_product_ids = self.products_df.iloc[top_indices]['ProductID'].tolist()
-
-        return recommended_product_ids
-
-    def recommend_popular_products(self, n=40):
-        """
-        Fallback recommendation strategy when user has no history
-        In a real system, this would use actual popularity metrics
-        """
-        # For now, just returning random products
-        return self.products_df.sample(min(n, len(self.products_df)))['ProductID'].tolist()
-
-# Create recommender instance
-recommender = Recommender()
-
-@app.route('/load_products', methods=['GET'])
-def load_products():
-    products = get_all_products()
-    if not products:
-        return jsonify({'error': 'Failed to load products from database'}), 500
-
-    recommender.load_products(products)
-    return jsonify({'message': 'Products loaded successfully', 'count': len(products)})
-
-@app.route('/recommend/<int:user_id>', methods=['GET'])
-def recommend(user_id):
-    # Check if products are loaded
-    if recommender.products_df is None:
-        products = get_all_products()
-        if not products:
-            return jsonify({'error': 'No products available'}), 500
-        recommender.load_products(products)
-
-    # Generate recommendations using cosine similarity
-    recommendations = recommender.recommend_products_for_user(user_id)
-
-    # Store recommendations in database
-    if store_user_recommendations(user_id, recommendations):
-        return jsonify({
-            'userId': user_id,
-            'recommendations': recommendations,
-            'count': len(recommendations)
-        })
-    else:
-        return jsonify({'error': 'Failed to store recommendations'}), 500
-
-@app.route('/api/user/session', methods=['POST'])
-def handle_session_data():
-    try:
-        data = request.get_json()
-        print("Received data:", data)  # Debug print
-
-        user_id = data.get('userId')
-        email = data.get('email')
-        is_authenticated = data.get('isAuthenticated')
-
-        if not user_id or not email or is_authenticated is None:
-            print("Missing required fields")  # Debug print
-            return jsonify({'error': 'Invalid data'}), 400
-
-        print(f"Processing session data: User ID: {user_id}, Email: {email}, Authenticated: {is_authenticated}")
-
-        # Test database connection first
-        try:
-            conn = get_db_connection()
-            conn.close()
-            print("Database connection successful")
-        except Exception as db_err:
-            print(f"Database connection error: {db_err}")
-            return jsonify({'error': f'Database connection error: {str(db_err)}'}), 500
-
-        # Continue with the rest of your code...
-
-    except Exception as e:
-        import traceback
-        print(f"Error in handle_session_data: {e}")
-        print(traceback.format_exc())  # Print full stack trace
-        return jsonify({'error': f'Server error: {str(e)}'}), 500
-
-if __name__ == '__main__':
-    # Load products on startup
-    products = get_all_products()
-    if products:
-        recommender.load_products(products)
-        print(f"Loaded {len(products)} products at startup")
-    else:
-        print("Warning: No products loaded at startup")
-
-    app.run(debug=True, host='0.0.0.0', port=5000)
--- a/recommondation-engine/example.sql
+++ b/recommondation-engine/example.sql
@@ -1,5 +0,0 @@
-select *
-from Product 
-Where ProductID in (select ProductID
-                    from History
-                    where UserID=1);
--- a/recommondation-engine/example1.py
+++ b/recommondation-engine/example1.py
@@ -1,221 +1,113 @@
+# pip install mysql-connector-python

-import pandas as pd
-import numpy as np
+import mysql.connector
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import logging

-'''
-Recommender system using content-based filtering
-'''
-class Recommender:
-    def __init__(self):
-        # Initialize data structures
-        self.products_df = None
-        self.user_profiles = {}
-        self.tfidf_matrix = None
-        self.tfidf_vectorizer = None
-        self.product_indices = None
-        
-    def load_products(self, products_data):
+def database():
+    """Open and return a new MySQL connection to the local ``Marketplace`` database.
+
+    Connects as ``root`` on ``localhost`` port 3306; no password is supplied.
+    The connection is handed to the caller, who is expected to close it.
+    """
+    return mysql.connector.connect(
+        host="localhost",
+        port="3306",
+        user="root",
+        database="Marketplace",
+    )
+
+
+def _fetch_category_rows(user_id=None):
+    """Fetch products as parallel lists of IDs and binary category vectors.
+
+    Builds one ``Cat_<id>`` flag column per row of ``Category`` (1 when the
+    product is linked to that category through ``Product_Category``, else 0).
+
+    Args:
+        user_id: When given, only products found in that user's ``History``
+            rows are returned; when ``None``, every product is returned.
+
+    Returns:
+        A ``(product_ids, vectors)`` tuple. ``vectors[i]`` is the list of
+        category flags for ``product_ids[i]``. Rows are ordered by ProductID
+        and flags by CategoryID so separate calls line up column for column.
+    """
+    db_con = database()
+    cursor = None
+    try:
+        cursor = db_con.cursor()
+        cursor.execute("SELECT CategoryID FROM Category ORDER BY CategoryID")
+        # int() keeps anything non-numeric out of the generated SQL text.
+        category_ids = [int(row[0]) for row in cursor.fetchall()]
+
+        flag_columns = "".join(
+            f", MAX(CASE WHEN pc.CategoryID = {cid} THEN 1 ELSE 0 END) AS `Cat_{cid}`"
+            for cid in category_ids
+        )
+        query = f"""
+            SELECT p.ProductID{flag_columns}
+            FROM Product p
+            LEFT JOIN Product_Category pc ON p.ProductID = pc.ProductID
+        """
+        params = None
+        if user_id is not None:
+            # Bound parameter instead of string formatting: user_id is caller input.
+            query += " WHERE p.ProductID IN (SELECT ProductID FROM History WHERE UserID = %s)"
+            params = (user_id,)
+        query += " GROUP BY p.ProductID ORDER BY p.ProductID"
+
+        cursor.execute(query, params)
+        rows = cursor.fetchall()
+    finally:
+        # Release the cursor and connection even when a query fails.
+        if cursor is not None:
+            cursor.close()
+        db_con.close()
+
+    product_ids = [row[0] for row in rows]
+    vectors = [list(row[1:]) for row in rows]
+    return product_ids, vectors
+
+
+def get_all_products():
+    """Return the binary category vector of every product, ordered by ProductID.
+
+    The ProductID itself is not included in the returned rows.
+    """
+    _, vectors = _fetch_category_rows()
+    return vectors
+
+
+def get_user_history(user_id):
+    """Return the binary category vectors of the products in a user's history.
+
+    Args:
+        user_id: Value matched against ``History.UserID``.
+
+    Returns:
+        A list with one category-flag list per history product (ProductID
+        excluded), ordered by ProductID. Empty when the user has no history.
+    """
+    _, vectors = _fetch_category_rows(user_id)
+    return vectors
+
+
+def get_recommendations(user_id, top_n=40):
+    """Rank products by cosine similarity to the user's average category vector.
+
+    Products already present in the user's history are not filtered out.
+
+    Args:
+        user_id: Value matched against ``History.UserID``.
+        top_n: Maximum number of product IDs to return.
+
+    Returns:
+        A list of ProductID values, most similar first. The list is empty
+        when ``top_n`` is not positive, when there are no products, no
+        categories or no usable history, or when an error occurred (the
+        error is logged).
+    """
+    if top_n <= 0:
+        return []
+    try:
+        product_ids, product_vectors = _fetch_category_rows()
+        _, history_vectors = _fetch_category_rows(user_id)
+
+        # No products, no history or no category columns: nothing to compare.
+        if not product_vectors or not history_vectors or not product_vectors[0]:
+            return []
+
+        # Average of the history vectors = the user's category preferences.
+        user_profile = np.mean(np.asarray(history_vectors, dtype=float), axis=0)
+        if not user_profile.any():
+            # History products carry no categories, so every score would be 0.
+            return []
+
+        scores = cosine_similarity(
+            [user_profile], np.asarray(product_vectors, dtype=float)
+        )[0]
+
+        # Highest score first; the stable sort keeps ties in ProductID order.
+        ranked = np.argsort(-scores, kind="stable")[:top_n]
+
+        # Return recommended product IDs
+        return [product_ids[i] for i in ranked]
+    except Exception:
+        logging.exception("Recommendation error for user %s", user_id)
+        return []
-        """
-        Load product data into the recommender system
-        
-        products_data: list of dictionaries with product info (id, name, description, category, etc.)
-        """
-        self.products_df = pd.DataFrame(products_data)
-        
-        # Create a text representation for each product (combining various features)
-        self.products_df['content'] = (
-            self.products_df['category'] + ' ' + 
-            self.products_df['name'] + ' ' + 
-            self.products_df['description']
-        )
-        
-        # Initialize TF-IDF vectorizer to convert text to vectors
-        self.tfidf_vectorizer = TfidfVectorizer(
-            stop_words='english',
-            max_features=5000,  # Limit features to avoid sparse matrices
-            ngram_range=(1, 2)  # Use both unigrams and bigrams
-        )
-        
-        # Compute TF-IDF matrix
-        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.products_df['content'])
-        
-        # Create a mapping from product_id to index
-        self.product_indices = pd.Series(
-            self.products_df.index, 
-            index=self.products_df['product_id']
-        ).drop_duplicates()
-        
-    def track_user_click(self, user_id, product_id):
-        """
-        Track user clicks on products to build user profiles
-        """
-        if user_id not in self.user_profiles:
-            self.user_profiles[user_id] = {
-                'clicks': {},
-                'category_weights': {
-                    'electronics': 0,
-                    'school supplies': 0,
-                    'rental place': 0,
-                    'furniture': 0
-                }
-            }
-        
-        # Get the clicked product's index and details
-        if product_id in self.product_indices:
-            product_idx = self.product_indices[product_id]
-            product_category = self.products_df.iloc[product_idx]['category']
-            
-            # Update click count
-            if product_id in self.user_profiles[user_id]['clicks']:
-                self.user_profiles[user_id]['clicks'][product_id] += 1
-            else:
-                self.user_profiles[user_id]['clicks'][product_id] = 1
-            
-            # Update category weight
-            self.user_profiles[user_id]['category_weights'][product_category] += 1
-    
-    def get_user_profile_vector(self, user_id):
-        """
-        Generate a user profile vector based on their click history
-        """
-        if user_id not in self.user_profiles or not self.user_profiles[user_id]['clicks']:
-            # Return a zero vector if no click history
-            return np.zeros((1, self.tfidf_matrix.shape[1]))
-        
-        # Create a weighted average of all clicked products' TF-IDF vectors
-        clicked_product_vectors = []
-        weights = []
-        
-        for product_id, click_count in self.user_profiles[user_id]['clicks'].items():
-            if product_id in self.product_indices:
-                product_idx = self.product_indices[product_id]
-                product_category = self.products_df.iloc[product_idx]['category']
-                category_weight = self.user_profiles[user_id]['category_weights'][product_category]
-                
-                # Weight is based on both click count and category preference
-                weight = click_count * (1 + 0.5 * category_weight)
-                weights.append(weight)
-                clicked_product_vectors.append(self.tfidf_matrix[product_idx])
-        
-        # Normalize weights
-        weights = np.array(weights) / np.sum(weights)
-        
-        # Compute weighted average
-        user_profile = np.zeros((1, self.tfidf_matrix.shape[1]))
-        for i, vector in enumerate(clicked_product_vectors):
-            user_profile += weights[i] * vector.toarray()
-            
-        return user_profile
-    
-    def recommend_products(self, user_id, n=5, category_filter=None):
-        """
-        Recommend products to a user based on their profile
-        
-        user_id: ID of the user
-        n: Number of recommendations to return
-        category_filter: Optional filter to limit recommendations to a specific category
-        """
-        # Get user profile vector
-        user_profile = self.get_user_profile_vector(user_id)
-        
-        # If user has no profile, recommend popular products (not implemented)
-        if np.sum(user_profile) == 0:
-            return self._get_popular_products(n, category_filter)
-        
-        # Calculate similarity scores
-        sim_scores = cosine_similarity(user_profile, self.tfidf_matrix)
-        sim_scores = sim_scores.flatten()
-        
-        # Create a DataFrame for easier filtering
-        recommendations_df = pd.DataFrame({
-            'product_id': self.products_df['product_id'],
-            'score': sim_scores,
-            'category': self.products_df['category']
-        })
-        
-        # Filter out products that the user has already clicked on
-        if user_id in self.user_profiles and self.user_profiles[user_id]['clicks']:
-            clicked_products = list(self.user_profiles[user_id]['clicks'].keys())
-            recommendations_df = recommendations_df[~recommendations_df['product_id'].isin(clicked_products)]
-        
-        # Apply category filter if provided
-        if category_filter:
-            recommendations_df = recommendations_df[recommendations_df['category'] == category_filter]
-        
-        # Sort by similarity score and get top n recommendations
-        recommendations_df = recommendations_df.sort_values('score', ascending=False).head(n)
-        
-        # Return recommended product IDs
-        return recommendations_df['product_id'].tolist()
-    
-    def _get_popular_products(self, n=5, category_filter=None):
-        """
-        Return popular products when a user has no profile
-        (This would typically be implemented with actual popularity metrics)
-        """
-        filtered_df = self.products_df
-        
-        if category_filter:
-            filtered_df = filtered_df[filtered_df['category'] == category_filter]
-        
-        # Just return random products for now (in a real system you'd use popularity metrics)
-        if len(filtered_df) >= n:
-            return filtered_df.sample(n)['product_id'].tolist()
-        else:
-            return filtered_df['product_id'].tolist()
-
-    def recommend_by_category_preference(self, user_id, n=5):
-        """
-        Recommend products based primarily on the user's category preferences
-        """
-        if user_id not in self.user_profiles:
-            return self._get_popular_products(n)
-        
-        # Get the user's most clicked category
-        category_weights = self.user_profiles[user_id]['category_weights']
-        
-        # If no category has been clicked, return popular products
-        if sum(category_weights.values()) == 0:
-            return self._get_popular_products(n)
-        
-        # Sort categories by number of clicks
-        sorted_categories = sorted(
-            category_weights.items(),
-            key=lambda x: x[1],
-            reverse=True
-        )
-        
-        recommendations = []
-        remaining = n
-        
-        # Allocate recommendations proportionally across categories
-        for category, weight in sorted_categories:
-            if weight > 0:
-                # Allocate recommendations proportionally to category weight
-                category_allocation = max(1, int(remaining * (weight / sum(category_weights.values()))))
-                if category_allocation > remaining:
-                    category_allocation = remaining
-                
-                # Get recommendations for this category
-                category_recs = self.recommend_products(user_id, category_allocation, category)
-                recommendations.extend(category_recs)
-                
-                # Update remaining slots
-                remaining -= len(category_recs)
-                
-                if remaining <= 0:
-                    break
-        
-        # If we still have slots to fill, add general recommendations
-        if remaining > 0:
-            general_recs = self.recommend_products(user_id, remaining)
-            # Filter out duplicates
-            general_recs = [rec for rec in general_recs if rec not in recommendations]
-            recommendations.extend(general_recs[:remaining])
-        
-        return recommendations


-exported = Recommender()
+# Module-level call: computes recommendations for user 1 every time this file is run or imported.
+get_recommendations(1)
--- a/recommondation-engine/test.py
+++ b/recommondation-engine/test.py
@@ -1,33 +0,0 @@
-import mysql.connector as db_con
-
-#TODO: Specify all the required queries
-query_get_all_Prod= ("SELECT * FROM Product ")
-
-
-#TODO: connect with the db
-def database():
-    db = db_con.connect(
-        host = "localhost",
-        port = 3306,
-        user = "root",
-        database = "Marketplace"
-    )
-
-    cursor = db.cursor()
-    cursor.execute(query_get_all_Prod)
-
-    data = [None]
-    for item in cursor:
-        data.append(item)
-        # print(item)
-
-    print(data[1])
-    cursor.close()
-    db.close()
-
-
-
-#TODO: Get All products
-# Make it into a dictionary with product id and the list of category it would have
-#  {Prod1:[1,0,0,0,1]} this could mean its a [elctronics, 0,0,0, kitchen]
-database()