import time
import re
import json
import random
import logging
import asyncio
import os       
import shutil
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Tuple
from urllib.parse import urlparse, urlencode
from uuid import UUID
from concurrent.futures import ThreadPoolExecutor
import requests

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from fastapi import HTTPException, status
import platform
import subprocess
from app.models.product import Product
from app.models.vendor import Vendor
from app.models.violation import Violation
from app.models.scraping_result import ScrapingResult

logger = logging.getLogger(__name__)

# Pool of realistic desktop user agents; one is picked at random per Chrome
# session (see initialize_selenium_driver) to make automation less obvious.
GOOGLE_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
]

# List of free/public proxies (use with caution, consider paid proxies for production).
# get_free_proxy() falls back to a random entry here when no HTTP_PROXY env var is set.
# NOTE(review): free proxies go stale quickly — verify these are still reachable.
FREE_PROXIES = [
    "socks4://103.146.170.233:5678",
    "socks4://103.165.64.86:4153",
    "http://108.165.152.59:80",
    "socks4://103.204.54.50:1080",
]

# HTTP headers that mimic a real Chrome navigation request.
# NOTE(review): Accept-Language prefers Spanish (es-ES) and Sec-Ch-Ua-Platform
# claims Windows — confirm these match the target market and chosen user agent.
HTTP_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
    "Cache-Control": "max-age=0",
    "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="122", "Google Chrome";v="122"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": '"Windows"',
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
}


def human_delay(min_seconds: float, max_seconds: float):
    """Sleep for a random duration in [min_seconds, max_seconds] to look human."""
    time.sleep(random.uniform(min_seconds, max_seconds))


def exponential_backoff_delay(attempt: int, base_delay: float = 2.0, max_delay: float = 60.0):
    """Sleep for an exponentially growing, jittered delay capped at max_delay.

    Intended for retry loops after rate limiting: the delay doubles with
    every attempt, and up to 10% random jitter is added so simultaneous
    retries don't synchronize (thundering herd).
    """
    raw = base_delay * (2 ** attempt)
    jittered = raw + random.uniform(0, raw * 0.1)
    time.sleep(min(jittered, max_delay))


def get_free_proxy() -> Optional[str]:
    """Pick a proxy: environment variables win, then the free-proxy pool.

    Checks HTTP_PROXY / http_proxy first; otherwise returns a random entry
    from FREE_PROXIES, or None when neither source yields a proxy.
    For production, consider using a paid proxy service.
    """
    for var in ('HTTP_PROXY', 'http_proxy'):
        configured = os.environ.get(var)
        if configured:
            return configured

    if FREE_PROXIES:
        return random.choice(FREE_PROXIES)

    return None


def get_selenium_proxy_url(proxy: Optional[str]) -> Optional[str]:
    """Ensure a proxy string carries a scheme, defaulting to http://.

    Returns None for falsy input; leaves http:// and socks* URLs untouched.
    """
    if not proxy:
        return None
    has_scheme = proxy.startswith(("http://", "socks"))
    return proxy if has_scheme else f"http://{proxy}"


def search_tavily_api(query: str) -> List[str]:
    """Run a web search through the Tavily API and return result URLs.

    Tavily is designed for AI agents and is far more reliable than browser
    automation. Requires TAVILY_API_KEY in the environment; returns an empty
    list when the key is missing or the request fails.
    """
    urls: List[str] = []

    try:
        logger.info("[Discovery] Using Tavily API for search")
        api_key = os.environ.get('TAVILY_API_KEY')
        if not api_key:
            logger.warning("[Discovery] TAVILY_API_KEY not configured - skipping Tavily search")
            return urls

        payload = {
            "api_key": api_key,
            "query": query,
            "include_answer": False,
            "max_results": 5,
            "include_raw_content": False,
        }

        response = requests.post("https://api.tavily.com/search", json=payload, timeout=15)
        response.raise_for_status()
        body = response.json()

        # Keep at most the first five result URLs.
        for item in body.get("results", [])[:5]:
            if "url" in item:
                urls.append(item["url"])

        logger.info(f"[Discovery] ✓ Tavily API search returned {len(urls)} results")
    except Exception as e:
        logger.warning(f"[Discovery] Tavily API search failed: {str(e)}")

    return urls


def search_via_api(query: str) -> List[str]:
    """API-based search entry point; delegates to the Tavily backend.

    Avoids Selenium entirely (no browser fingerprint to detect), so it is
    the fallback when Google blocks browser-based scraping.
    """
    return search_tavily_api(query)


def _parse_price_value(price_str) -> Optional[float]:
    """Parse a price string such as '$1,234.56' or '1.234,56 €' into a float.

    Strips every non-digit character except '.' and ',', then treats the
    rightmost separator as the decimal point, so both US (comma thousands)
    and European (dot thousands) formats parse correctly.

    Returns None when no numeric value can be recovered.
    """
    try:
        clean = re.sub(r"[^\d.,]", "", str(price_str).strip())
        if not clean:
            return None
        last_dot = clean.rfind(".")
        last_comma = clean.rfind(",")
        if last_dot != -1 and last_comma != -1:
            # Both separators present: the rightmost one is the decimal point.
            if last_comma > last_dot:
                clean = clean.replace(".", "").replace(",", ".")
            else:
                clean = clean.replace(",", "")
        elif last_comma != -1:
            # Comma only: a single comma is a decimal separator, several are thousands.
            clean = clean.replace(",", ".") if clean.count(",") == 1 else clean.replace(",", "")
        return float(clean)
    except (ValueError, AttributeError, TypeError):
        return None


def search_serp_api(product_name: str, barcode: str, msp: Optional[float] = None) -> List[Dict]:
    """
    Search using SERP API Google Shopping Light Engine.
    Uses direct HTTP requests to SERP API (more reliable than library).

    Args:
        product_name: Product name for search
        barcode: Product barcode
        msp: Minimum Selling Price (optional, used for violation detection)

    Returns:
        List of dictionaries with product and vendor information
        (vendor_name, vendor_url, scraped_price, product_title, source,
        status/compliance_status: "violation" | "compliant" | "unknown").
    """
    results = []

    try:
        logger.info("[Discovery SERP] Using SERP API Google Shopping Light Engine for search")
        api_key = os.environ.get('SERP_API_KEY')

        # Check if API key is configured
        if not api_key:
            logger.error("[Discovery SERP] ❌ SERP_API_KEY not found in environment variables")
            logger.error("[Discovery SERP] Please add SERP_API_KEY=<your_key> to .env file")
            return results

        if api_key == 'your_serp_api_key_here':
            logger.error("[Discovery SERP] ❌ SERP_API_KEY is set to placeholder value")
            logger.error("[Discovery SERP] Please replace 'your_serp_api_key_here' with your actual SERP API key")
            return results

        # Build search query with product name only (better SERP results)
        # Using only product name for broader search results
        search_query = f"{product_name}"
        logger.info(f"[Discovery SERP] Search query: '{search_query}'")

        # Direct HTTP request to SERP API (more reliable than GoogleSearch library)
        serp_url = "https://serpapi.com/search.json"

        params = {
            "engine": "google_shopping_light",  # Correct engine for shopping light
            "q": search_query,  # Query with only product name for broader search results
            "gl": "es",  # Country code: Spain (es), can be changed (us, uk, fr, etc.)
            "api_key": api_key
        }

        try:
            response = requests.get(serp_url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as req_error:
            logger.error(f"[Discovery SERP] HTTP request failed: {str(req_error)}")
            return results
        except ValueError as json_error:
            logger.error(f"[Discovery SERP] Failed to parse JSON response: {str(json_error)}")
            return results

        # Check for empty response
        if not data:
            return results

        # Check for API errors
        if "error" in data:
            logger.error(f"[Discovery SERP] API Error: {data.get('error')}")
            return results

        # Extract shopping results from SERP API response
        shopping_results = data.get("shopping_results", [])

        # If no shopping results, try a fallback search
        if not shopping_results:
            logger.warning(f"[Discovery SERP] ⚠️ No 'shopping_results' key in response")
            logger.warning(f"[Discovery SERP] No results found. Trying fallback: searching with product name only...")

            # Fallback: retry with just the stripped product name (no barcode).
            # Skip the retry when it would repeat the identical query — the
            # primary query already uses only the product name.
            name_only_query = product_name.strip()
            if barcode and " " in search_query and name_only_query != search_query:
                try:
                    fallback_params = {
                        "engine": "google_shopping_light",
                        "q": name_only_query,
                        "api_key": api_key
                    }
                    fallback_response = requests.get(serp_url, params=fallback_params, timeout=30)
                    fallback_response.raise_for_status()
                    fallback_data = fallback_response.json()
                    fallback_results = fallback_data.get("shopping_results", [])

                    if fallback_results:
                        logger.info(f"[Discovery SERP] ✓ Fallback search successful! Found {len(fallback_results)} results with product name only")
                        shopping_results = fallback_results
                    else:
                        logger.warning(f"[Discovery SERP] ⚠️ Fallback search also returned no shopping_results")
                except Exception as fallback_error:
                    logger.warning(f"[Discovery SERP] Fallback search failed: {str(fallback_error)}")

        # Process shopping results
        for idx, result in enumerate(shopping_results[:15]):  # Process up to 15 results
            try:
                # Parse SERP API Google Shopping Light result format
                # Shopping light results have: source, product_link, extracted_price, title, etc.
                vendor_name = result.get("source", "Unknown")  # Source/vendor name
                product_link = result.get("product_link", "") or result.get("link", "")  # Product URL (try product_link first, then link)
                title = result.get("title", "")  # Product title

                # Price: prefer extracted_price (already parsed), fallback to price string
                price = result.get("extracted_price")  # Already a float if available
                if price is None:
                    price_str = result.get("price", "")  # Fallback to string price
                    if price_str:
                        # Robust parse handles both "1,234.56" and "1.234,56"
                        price = _parse_price_value(price_str)

                if not product_link:
                    continue

                # NOTE: SERP API google_shopping_light returns Google Shopping links in product_link
                # This is expected behavior - we use source (vendor name) + price for violation detection
                # Google Shopping redirects are acceptable as they still link to the product with vendor info

                result_dict = {
                    "vendor_name": vendor_name,
                    "vendor_url": product_link,
                    "scraped_price": price,
                    "product_title": title,
                    "source": "serp_api_shopping_light"
                }

                # Determine compliance status relative to the MSP (when both known)
                if price and msp:
                    if price < msp:
                        result_dict["status"] = "violation"
                        result_dict["compliance_status"] = "violation"
                    else:
                        result_dict["status"] = "compliant"
                        result_dict["compliance_status"] = "compliant"
                else:
                    result_dict["status"] = "unknown"
                    result_dict["compliance_status"] = "unknown"

                results.append(result_dict)
                logger.info(f"[Discovery SERP] ✓ Result {idx + 1}: {vendor_name} - {product_link[:60]} - Price: {f'₹{price:.2f}' if price else 'N/A'} - Status: {result_dict['status'].upper()}")

            except Exception as e:
                logger.warning(f"[Discovery SERP] Error parsing result {idx}: {str(e)}")
                continue

        logger.info(f"[Discovery SERP] ✓ SERP API search completed. Processed {min(15, len(shopping_results))} out of {len(shopping_results)} available results, returned {len(results)} for analysis")

    except Exception as e:
        logger.error(f"[Discovery SERP] SERP API search failed: {str(e)}", exc_info=True)

    return results


def ensure_chrome_dependencies():
    """Verify Chrome and the shell utilities it needs are usable (Linux only).

    On non-Linux platforms this is a no-op returning True. On Linux it
    checks for core shell commands, locates a Chrome binary, and smoke-tests
    `google-chrome --version`.

    Returns:
        bool: True when Chrome looks usable, False otherwise.
    """
    if platform.system() != "Linux":
        return True

    logger.info("[Chrome Dependencies] Checking Chrome and system utilities on Linux...")

    # Chrome's launcher scripts depend on these core shell utilities.
    needed = ['readlink', 'dirname', 'cat', 'basename', 'grep', 'sed', 'awk']
    missing = []
    for command in needed:
        try:
            probe = subprocess.run(['which', command], capture_output=True, timeout=2)
        except Exception:
            missing.append(command)
        else:
            if probe.returncode != 0:
                missing.append(command)

    if missing:
        logger.error(f"[Chrome Dependencies] ✗ Missing core shell commands: {', '.join(missing)}")
        logger.error("[Chrome Dependencies] DevOps must install: sudo apt-get install -y coreutils grep sed gawk")
        return False

    # Locate Chrome: first via PATH, then well-known install locations.
    chrome_available = False
    try:
        probe = subprocess.run(["which", "google-chrome"], capture_output=True, timeout=5)
        if probe.returncode == 0:
            located = probe.stdout.decode().strip()
            logger.info(f"[Chrome Dependencies] ✓ Chrome found in PATH: {located}")
            chrome_available = True
    except Exception as e:
        logger.debug(f"[Chrome Dependencies] Could not check PATH: {str(e)}")

    if not chrome_available:
        for candidate in ("/usr/bin/google-chrome", "/usr/bin/google-chrome-stable", "/usr/bin/chromium"):
            if os.path.exists(candidate):
                logger.info(f"[Chrome Dependencies] ✓ Chrome found at: {candidate}")
                chrome_available = True
                break

    if not chrome_available:
        logger.error("[Chrome Dependencies] ✗ Chrome NOT found - DevOps must install Chrome")
        return False

    # Smoke-test the binary; a failure here usually means broken shell deps.
    try:
        probe = subprocess.run(["google-chrome", "--version"], capture_output=True, text=True, timeout=10)
    except subprocess.TimeoutExpired:
        logger.error("[Chrome Dependencies] ✗ Chrome --version timed out (likely missing shell commands)")
        return False
    except Exception as e:
        logger.error(f"[Chrome Dependencies] ✗ Could not verify Chrome: {str(e)}")
        return False

    if probe.returncode == 0 and probe.stdout.strip():
        logger.info(f"[Chrome Dependencies] ✓ Chrome executable: {probe.stdout.strip()}")
        return True

    logger.error(f"[Chrome Dependencies] ✗ Chrome --version failed: {probe.stderr}")
    logger.error("[Chrome Dependencies] This indicates missing shell commands (readlink, dirname, cat)")
    return False


# Thread pool executor for running synchronous scraping operations.
# max_workers=1 ensures at most one Selenium/Chrome instance runs at a time,
# bounding memory use and avoiding conflicts between concurrent sessions.
_executor = ThreadPoolExecutor(max_workers=1)  # Single worker to avoid too many browser instances



def cleanup_chrome_session(driver):
    """Clean up Chrome session temporary files.

    Removes the per-session user-data directory that
    initialize_selenium_driver() stored on the driver as `_user_data_dir`.
    Best-effort: failures are logged at warning level and swallowed.

    NOTE(review): this function is defined a second time, verbatim, further
    down this file; the later definition shadows this one at import time.
    Consider deleting one of the two copies.
    """
    try:
        if hasattr(driver, '_user_data_dir') and driver._user_data_dir:
            user_data_dir = driver._user_data_dir
            if os.path.exists(user_data_dir):
                shutil.rmtree(user_data_dir, ignore_errors=True)
                logger.info(f"[Chrome Cleanup] Removed session directory: {user_data_dir}")
    except Exception as e:
        logger.warning(f"[Chrome Cleanup] Failed to cleanup: {str(e)}")


def get_domain_from_url(url: str) -> str:
    """Return the URL's host with the 'www.' prefix stripped."""
    netloc = urlparse(url).netloc
    return netloc.replace("www.", "")


def normalize_url_for_comparison(url: str) -> str:
    """
    Normalize a URL for comparison purposes.

    Lowercases, strips 'www.' and any ':port' from the host, removes the
    trailing slash, and drops query parameters and fragments so equivalent
    URLs compare equal. Falls back to the lowercased, stripped input when
    parsing fails.
    """
    try:
        parsed = urlparse(url.lower().strip())
        # Host without 'www.' or a port suffix
        # (NOTE: replaces any 'www.' occurrence, not just the prefix — kept
        # for backward compatibility with existing stored comparisons)
        domain = parsed.netloc.replace("www.", "").split(":")[0]
        # Path without its trailing slash
        path = parsed.path.rstrip("/")
        # Query params and fragments are deliberately ignored
        return f"{domain}{path}"
    except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        return url.lower().strip()


def normalize_price(price_str: str) -> Optional[float]:
    """Normalize a price string (e.g. '€1.234,56', '$1,234.56') to a float.

    The previous implementation blindly turned every comma into a dot, which
    made any price with a thousands separator ('1,234.56' or '1.234,56')
    unparseable. Now the rightmost of '.'/',' is treated as the decimal
    separator, covering both US and European formats. Returns None when no
    numeric value can be recovered.
    """
    try:
        clean = re.sub(r"[^\d.,]", "", str(price_str).strip())
        if not clean:
            return None
        last_dot = clean.rfind(".")
        last_comma = clean.rfind(",")
        if last_dot != -1 and last_comma != -1:
            # Both separators present: the rightmost one is the decimal point.
            if last_comma > last_dot:
                clean = clean.replace(".", "").replace(",", ".")
            else:
                clean = clean.replace(",", "")
        elif last_comma != -1:
            # Comma only: one comma means decimal, several mean thousands.
            clean = clean.replace(",", ".") if clean.count(",") == 1 else clean.replace(",", "")
        return float(clean)
    except (ValueError, AttributeError, TypeError):
        return None


def calculate_price_difference(msp: float, scraped_price: float) -> Tuple[float, float]:
    """Return (absolute difference, percent difference) of MSP vs scraped price.

    Positive values mean the scraped price undercuts the MSP. The percentage
    is relative to MSP and reported as 0 when MSP is not positive.
    """
    difference = round(msp - scraped_price, 2)
    if msp > 0:
        percentage = round(difference / msp * 100, 2)
    else:
        percentage = round(0, 2)
    return difference, percentage


def determine_compliance_status(msp: float, scraped_price: Optional[float]) -> str:
    """Map a scraped price against the MSP to a compliance status string.

    Returns "unknown" when no price was scraped, "violation" when the price
    undercuts the MSP, "complain" when it is above the MSP, and "compliant"
    when they are equal.

    NOTE(review): "complain" looks like it may be a typo for "compliant" —
    confirm against every consumer of this status string before renaming.
    """
    if scraped_price is None:
        return "unknown"
    if scraped_price < msp:
        return "violation"
    return "complain" if scraped_price > msp else "compliant"


async def check_duplicate_violation(
    db: AsyncSession,
    url: str,
    scraped_price: Optional[float],
    barcode: str,
    product_name: str,
    msp: float,
    vendor_name: Optional[str] = None
) -> bool:
    """Return True when an equivalent violation record already exists.

    Prevents duplicates created by finding the same vendor from different
    sources (e.g. registered vendor scraping vs SERP API discovery). A record
    counts as a duplicate when product_name, barcode_number, msp and
    scraped_price all match AND either the vendor_name matches (preferred —
    the same vendor can surface via different URLs) or, when no vendor_name
    is given, the exact URL matches.

    On any error the check fails open (returns False) so violation creation
    is never blocked by a broken lookup.
    """
    from sqlalchemy import and_

    try:
        # Without a price, barcode and name there is nothing meaningful to match.
        if scraped_price is None or not barcode or not product_name:
            return False

        # Preferred check: same vendor at the same price for the same product.
        if vendor_name:
            vendor_stmt = select(Violation).where(
                and_(
                    Violation.barcode_number == barcode,
                    Violation.product_name == product_name,
                    Violation.msp == msp,
                    Violation.scraped_price == scraped_price,
                    Violation.vendor_name == vendor_name
                )
            )
            vendor_match = (await db.execute(vendor_stmt)).scalars().first()
            if vendor_match:
                logger.info(f"[Duplicate] Vendor match found: {vendor_name} selling {product_name} (barcode: {barcode}) @ {scraped_price} (MSP: {msp})")
                return True

        # Fallback check: exact URL match when no vendor name was supplied.
        if url:
            url_stmt = select(Violation).where(
                and_(
                    Violation.barcode_number == barcode,
                    Violation.url == url,
                    Violation.scraped_price == scraped_price,
                    Violation.product_name == product_name,
                    Violation.msp == msp
                )
            )
            url_match = (await db.execute(url_stmt)).scalars().first()
            if url_match:
                logger.info(f"[Duplicate] Exact URL match found: {product_name} (barcode: {barcode}), URL: {url[:60]}, Price: {scraped_price}")
                return True

        return False
    except Exception as e:
        logger.warning(f"[Duplicate Check] Error: {str(e)}")
        # Fail open: allow creation rather than block on a broken lookup.
        return False


def check_disk_space(path: str = "/tmp", min_gb: int = 1) -> bool:
    """Check whether *path* has at least *min_gb* GB of free space for Chrome.

    Returns True when enough space is available, or when the probe itself
    fails (fail-open, so a broken check never blocks scraping). The redundant
    function-local `import shutil` was removed — shutil is already imported
    at module level.
    """
    try:
        stat = shutil.disk_usage(path)
        available_gb = stat.free / (1024 ** 3)

        if available_gb < min_gb:
            logger.error(f"[Disk Space] ✗ Insufficient space: {available_gb:.2f}GB available, need {min_gb}GB")
            return False

        logger.info(f"[Disk Space] ✓ Available: {available_gb:.2f}GB (threshold: {min_gb}GB)")
        return True
    except Exception as e:
        logger.warning(f"[Disk Space] Could not check disk space: {str(e)}")
        return True  # Allow to proceed if check fails


def cleanup_orphaned_chrome_processes():
    """Force-kill any leftover Chrome/ChromeDriver processes (Linux only).

    Best-effort: a failed pkill is logged at debug level and ignored.
    """
    if platform.system() != "Linux":
        return

    try:
        subprocess.run(["pkill", "-9", "-f", "chrome"], capture_output=True, timeout=5)
        logger.info("[Process Cleanup] Killed orphaned Chrome processes")
    except Exception as e:
        logger.debug(f"[Process Cleanup] Could not kill processes: {str(e)}")


def initialize_selenium_driver(headless: bool = True, use_proxy: bool = False) -> webdriver.Chrome:
    """Initialize and configure Chrome WebDriver with anti-detection measures.

    Verifies Chrome dependencies and disk space, kills orphaned Chrome
    processes, builds a heavily-flagged Options set, locates the Chrome
    binary (env var -> PATH -> well-known paths), installs a matching
    ChromeDriver via webdriver_manager, and starts the driver.

    Args:
        headless: Run Chrome in the new headless mode with a fixed window size.
        use_proxy: Route traffic through a proxy from get_free_proxy().

    Returns:
        webdriver.Chrome: Configured driver. Its per-session temp profile
        path is stored as `driver._user_data_dir` so cleanup_chrome_session()
        can remove it later.

    Raises:
        RuntimeError: If Chrome dependencies are missing or /tmp lacks space.
        Exception: Re-raised from webdriver.Chrome on startup failure (the
            temp profile directory is removed first).
    """
    
    # Check critical preconditions
    if not ensure_chrome_dependencies():
        raise RuntimeError("Chrome dependencies check failed. DevOps must install Chrome via setup scripts.")
    
    if not check_disk_space("/tmp", min_gb=1):
        raise RuntimeError("Insufficient disk space in /tmp. DevOps must free up space.")
    
    # Clean up any orphaned processes
    cleanup_orphaned_chrome_processes()
    
    options = Options()
    if headless:
        options.add_argument("--headless=new")
        options.add_argument("--window-size=1920,1080")
        options.add_argument("--start-maximized")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-setuid-sandbox")
        options.add_argument("--disable-dev-shm-usage")
    
    # Proxy support for rotating IPs
    if use_proxy:
        proxy_url = get_free_proxy()
        if proxy_url:
            proxy_url = get_selenium_proxy_url(proxy_url)
            options.add_argument(f"--proxy-server={proxy_url}")
            logger.info(f"[Chrome Init] Using proxy: {proxy_url[:30]}")
    
    # Core stability and crash prevention flags
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-software-rasterizer")
    options.add_argument("--incognito")
    options.add_argument("--disable-gpu-sandbox")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-plugins")
    options.add_argument("--disable-plugins-discovery")
    options.add_argument("--disable-print-preview")
    options.add_argument("--disable-component-extensions-with-background-pages")
    options.add_argument("--no-default-browser-check")
    options.add_argument("--disable-background-networking")
    options.add_argument("--disable-sync")
    options.add_argument("--disable-translate")
    options.add_argument("--hide-scrollbars")
    options.add_argument("--metrics-recording-only")
    options.add_argument("--mute-audio")
    options.add_argument("--no-first-run")
    options.add_argument("--safebrowsing-disable-auto-update")
    options.add_argument("--disable-accelerated-2d-canvas")
    options.add_argument("--no-zygote")
    options.add_argument("--disable-background-timer-throttling")
    options.add_argument("--disable-backgrounding-occluded-windows")
    options.add_argument("--disable-renderer-backgrounding")
    options.add_argument("--disable-features=TranslateUI,IsolateOrigins,site-per-process")
    options.add_argument("--disable-ipc-flooding-protection")
    options.add_argument("--disable-default-apps")
    options.add_argument("--password-store=basic")
    options.add_argument("--use-mock-keychain")
    # NOTE(review): --disable-web-security weakens browser isolation; confirm
    # it is genuinely needed for the target sites before keeping it.
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--disable-webgl")
    options.add_argument("--disable-threaded-animation")
    options.add_argument("--disable-threaded-scrolling")
    options.add_argument("--disable-in-process-stack-traces")
    options.add_argument("--disable-histogram-customizer")
    options.add_argument("--disable-gl-extensions")
    options.add_argument("--disable-composited-antialiasing")
    options.add_argument("--disable-canvas-aa")
    options.add_argument("--disable-3d-apis")
    options.add_argument("--disable-accelerated-video-decode")
    options.add_argument("--disable-background-media-download")
    options.add_argument("--disable-domain-reliability")
    options.add_argument("--disable-client-side-phishing-detection")
    options.add_argument("--disable-component-update")
    options.add_argument("--disable-hang-monitor")
    options.add_argument("--disable-prompt-on-repost")
    options.add_argument("--force-fieldtrials=SiteIsolationExtensions/Control")
    options.add_argument("--disable-back-forward-cache")
    options.add_argument("--disable-popup-blocking")
    options.add_argument("--disable-session-crashed-bubble")
    options.add_argument("--disable-infobars")
    
    # Crash prevention and resource management flags
    options.add_argument("--disable-breakpad")
    options.add_argument("--disable-crash-reporter")
    options.add_argument("--disable-default-tracing")
    options.add_argument("--disable-media-session")
    options.add_argument("--no-service-autorun")
    options.add_argument("--disable-audio")
    options.add_argument("--disable-features=VizDisplayCompositor,AudioServiceOutOfProcess")
    options.add_argument("--disable-preconnect")
    options.add_argument("--disable-client-hints")
    
    # Debugging and temporary storage
    # NOTE(review): the fixed remote-debugging port 9222 will clash if two
    # drivers ever run concurrently — currently safe because _executor uses
    # max_workers=1, but confirm before increasing parallelism.
    options.add_argument("--remote-debugging-port=9222")
    options.add_argument("--remote-debugging-address=0.0.0.0")
    options.add_argument("--crash-dumps-dir=/tmp")
    options.add_argument("--data-path=/tmp/chrome-data")
    options.add_argument("--disk-cache-dir=/tmp/chrome-cache")
    options.add_argument("--disable-logging")
    options.add_argument("--disable-logging-redirect")
    options.add_argument("--log-level=3")
    
    # Single process mode to reduce memory overhead
    options.add_argument("--single-process")
    
    # Set random user agent from the list to avoid detection
    user_agent = random.choice(GOOGLE_USER_AGENTS)
    logger.info(f"[Chrome Init] Using user agent: {user_agent[:50]}...")
    
    options.add_argument(f"--user-agent={user_agent}")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    
    # Set environment variables for Chrome
    # NOTE(review): assumes an X virtual framebuffer (e.g. Xvfb) is running
    # on display :99 — confirm the deployment environment provides one.
    os.environ['DISPLAY'] = ':99'  # Virtual display
    os.environ['DBUS_SESSION_BUS_ADDRESS'] = '/dev/null'
    
    # Create unique user data directory for this session
    import uuid
    unique_session_id = str(uuid.uuid4())[:8]
    user_data_dir = f"/tmp/chrome-session-{unique_session_id}"
    
    # Try to create necessary directories
    try:
        os.makedirs(user_data_dir, exist_ok=True)
        os.makedirs('/tmp/chrome-cache', exist_ok=True)
        os.makedirs('/tmp/chrome-data', exist_ok=True)
    except Exception as e:
        logger.warning(f"Could not create Chrome temp directories: {str(e)}")
    
    options.add_argument(f"--user-data-dir={user_data_dir}")
    
    # Method 1: Check environment variable
    chrome_path = os.environ.get('CHROME_BIN') or os.environ.get('GOOGLE_CHROME_BIN')
    if chrome_path:
        logger.info(f"[Chrome Detection] Found via environment variable: {chrome_path}")
    
    # Method 2: Check if chrome is in PATH
    if not chrome_path:
        for cmd in ['google-chrome', 'chromium-browser', 'chromium', 'chrome']:
            found = shutil.which(cmd)
            if found:
                chrome_path = found
                logger.info(f"[Chrome Detection] Found in PATH using 'which {cmd}': {chrome_path}")
                break
    
    # Method 3: Check common installation paths
    if not chrome_path:
        system = platform.system()
        logger.info(f"[Chrome Detection] Checking common paths for {system}...")
        
        if system == "Windows":
            possible_paths = [
                r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
            ]
        elif system == "Linux":
            possible_paths = [
                "/usr/bin/google-chrome",        # Chrome first (main)
                "/usr/bin/google-chrome-stable", 
                "/usr/bin/chromium-browser",     # Chromium fallback
                "/usr/bin/chromium",
                "/opt/google/chrome/chrome",
                "/snap/bin/chromium"
            ]
        elif system == "Darwin":  # macOS
            possible_paths = [
                "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "/Applications/Chromium.app/Contents/MacOS/Chromium"
            ]
        else:
            possible_paths = []
            
        for path in possible_paths:
            if os.path.exists(path):
                chrome_path = path
                logger.info(f"[Chrome Detection] Found at path: {chrome_path}")
                break
        
        if not chrome_path:
            logger.warning("[Chrome Detection] Chrome not found in common paths")
    
    # Verify Chrome is executable and accessible
    if chrome_path:
        logger.info(f"[Chrome Verification] Verifying Chrome at: {chrome_path}")
        try:
            # Check if file exists
            if os.path.exists(chrome_path):
                logger.info(f"[Chrome Verification] ✓ File exists at {chrome_path}")
                # Try to get version
                result = subprocess.run([chrome_path, "--version"], capture_output=True, text=True, timeout=5)
                if result.returncode == 0:
                    logger.info(f"[Chrome Verification] ✓ Chrome is executable: {result.stdout.strip()}")
                else:
                    logger.warning(f"[Chrome Verification] ⚠ Chrome returned error: {result.stderr}")
            else:
                logger.error(f"[Chrome Verification] ✗ File not found: {chrome_path}")
                chrome_path = None
        except subprocess.TimeoutExpired:
            logger.warning("[Chrome Verification] ⚠ Chrome --version timed out")
        except Exception as e:
            logger.warning(f"[Chrome Verification] ⚠ Could not verify Chrome: {str(e)}")
    
    # Set the binary location if found
    if chrome_path:
        options.binary_location = chrome_path
        logger.info(f"[Chrome Init] Using Chrome binary: {chrome_path}")
    else:
        logger.warning("[Chrome Init] Chrome binary not found. Selenium will attempt auto-detection.")

    try:
        logger.info("[Chrome Init] Installing/verifying ChromeDriver with ChromeDriverManager...")
        chromedriver_path = ChromeDriverManager().install()
        logger.info(f"[Chrome Init] ChromeDriver ready at: {chromedriver_path}")
        
        service = Service(chromedriver_path)
        logger.info("[Chrome Init] Creating Selenium WebDriver...")
        driver = webdriver.Chrome(service=service, options=options)
        logger.info("[Chrome Init] ✓ WebDriver initialized successfully")
        # Store the user data directory for cleanup
        driver._user_data_dir = user_data_dir
        return driver
        
    except Exception as e:
        logger.error(f"[Chrome Init] ✗ Failed to initialize WebDriver: {str(e)}", exc_info=True)
        # Cleanup temp directory if initialization failed
        try:
            if 'user_data_dir' in locals() and os.path.exists(user_data_dir):
                shutil.rmtree(user_data_dir, ignore_errors=True)
                logger.info(f"[Chrome Cleanup] Removed temp directory: {user_data_dir}")
        except Exception:
            pass
        raise



def cleanup_chrome_session(driver):
    """Remove the temporary Chrome user-data directory attached to *driver*.

    The initialization helper stores the directory path on the driver as
    ``_user_data_dir``; cleanup is best-effort and never raises.
    """
    try:
        session_dir = getattr(driver, '_user_data_dir', None)
        if session_dir and os.path.exists(session_dir):
            shutil.rmtree(session_dir, ignore_errors=True)
            logger.info(f"[Chrome Cleanup] Removed session directory: {session_dir}")
    except Exception as e:
        logger.warning(f"[Chrome Cleanup] Failed to cleanup: {str(e)}")



def dismiss_overlays(driver):
    """Best-effort dismissal of cookie banners, modals, and popups.

    Clicks every visible element matching a list of common consent/close
    selectors, then sends ESC to the currently focused element as a final
    fallback. All failures are swallowed: overlay handling must never abort
    a scrape.
    """
    overlay_selectors = (
        "button.cookie-accept",
        "button#cookie-accept",
        "button.accept",
        "button.btn-accept",
        "button[aria-label='Close']",
        "button.close",
        ".modal button.close",
        ".popup button.close",
        ".cookie-consent button",
        "div#cookie-consent button",
        "button[title='Close']",
    )
    for css in overlay_selectors:
        try:
            for candidate in driver.find_elements(By.CSS_SELECTOR, css):
                if candidate.is_displayed():
                    candidate.click()
                    # Short pause so any close animation finishes before the
                    # next click attempt.
                    time.sleep(0.3)
        except Exception:
            pass
    # Fallback: ESC often closes whatever modal currently has focus.
    try:
        driver.switch_to.active_element.send_keys(Keys.ESCAPE)
    except Exception:
        pass


def find_and_search(driver, search_query: str, vendor_name: str = "") -> bool:
    """Locate a search input on a vendor page, type *search_query*, submit.

    Tries a prioritized list of XPath/CSS selectors for the search box, then
    attempts submission via Enter, form submit, and finally a visible search
    button (see :func:`_submit_search`).

    Args:
        driver: Active Selenium WebDriver already on the vendor page.
        search_query: Text to type into the search box.
        vendor_name: Vendor label; currently unused, kept for call-site
            compatibility.

    Returns:
        True when a submission method succeeded, False otherwise.
    """
    search_selectors = [
        ("xpath", "//input[@type='search']", "search input type"),
        ("xpath", "//input[@placeholder[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'search')]]", "search placeholder XPath"),
        ("xpath", "//input[@name='search' or @name='q' or @name='keyword' or @name='searchInput']", "named search input"),
        ("css", "input[type='search']", "CSS search input"),
        ("css", "[placeholder*='search'], [placeholder*='Search']", "CSS search placeholder"),
        ("css", "input[placeholder*='buscar'], input[placeholder*='Buscar']", "Spanish buscar"),
        ("xpath", "//form//input[1]", "first form input"),
        ("xpath", "//input[@class[contains(., 'search')]]", "search class input"),
        ("css", "input.search, input.search-box, input.searchbox", "common search classes"),
    ]

    for selector_type, selector, description in search_selectors:
        try:
            if selector_type == "xpath":
                element = driver.find_element(By.XPATH, selector)
            else:
                element = driver.find_element(By.CSS_SELECTOR, selector)

            # Skip hidden/disabled candidates (e.g. collapsed mobile search).
            if not element.is_displayed() or not element.is_enabled():
                continue

            # Focus the input; some sites only activate it after a click.
            # (Was a bare `except:` — narrowed so Ctrl-C isn't swallowed.)
            try:
                element.click()
                human_delay(0.3, 0.6)
            except Exception:
                pass

            # Clear any pre-filled text; failure here is non-fatal.
            try:
                element.clear()
            except Exception:
                pass

            element.send_keys(search_query)
            human_delay(0.8, 1.2)

            if _submit_search(driver, element):
                human_delay(2, 3)
                return True
            # No submit method worked for this input; try the next selector.

        except Exception:
            continue

    return False


def _submit_search(driver, element) -> bool:
    """Try Enter, then form submission, then a visible search button.

    Returns True as soon as one method succeeds, False if all fail.
    """
    # Method 1: press Enter in the input.
    try:
        element.send_keys(Keys.RETURN)
        return True
    except Exception:
        pass

    # Method 2: submit the enclosing form.
    try:
        element.submit()
        return True
    except Exception:
        pass

    # Method 3: find and click a visible search/submit button.
    button_selectors = [
        ("//button[contains(., 'Search')]", "Search button text"),
        ("//button[contains(., 'Buscar')]", "Buscar button text"),
        ("//button[@type='submit']", "submit button"),
        ("//input[@type='submit']", "submit input"),
        ("//button[contains(@class, 'search')]", "search class button"),
    ]
    for btn_selector, btn_desc in button_selectors:
        try:
            buttons = driver.find_elements(By.XPATH, btn_selector)
            if buttons and buttons[0].is_displayed():
                buttons[0].click()
                logger.info(f"[Search] Submitted with {btn_desc}")
                return True
        except Exception:
            continue
    return False


def extract_price_from_html(page_source: str) -> Optional[float]:
    """Extract a plausible product price (1.0–5000.0) from raw HTML.

    Strategies, in order:
      1. JSON-LD structured data (schema.org ``Offer`` / ``offers.price``).
      2. ``itemprop="price"`` microdata.
      3. Fallback: median of all in-range ``\\d+[.,]\\d{2}`` numbers in the
         page text.

    Returns:
        The first in-range price found, or None when nothing plausible is
        present.

    BUG FIX: out-of-range candidates are now discarded instead of being
    cached in ``scraped_price`` — previously an out-of-range JSON-LD price
    (e.g. 10000.0) suppressed strategies 2 and 3 (their ``if not
    scraped_price`` guards) and was then returned un-validated at the end.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    # Strategy 1: JSON-LD structured data.
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            if not script.string:
                continue
            data = json.loads(script.string)

            def find_price(obj):
                """Depth-first search for the first parseable price value."""
                if isinstance(obj, dict):
                    if obj.get("@type") == "Offer" and "price" in obj:
                        try:
                            return float(obj["price"])
                        except (ValueError, TypeError):
                            pass
                    if "offers" in obj:
                        offers = obj["offers"]
                        if isinstance(offers, dict) and "price" in offers:
                            try:
                                return float(offers["price"])
                            except (ValueError, TypeError):
                                pass
                    for value in obj.values():
                        result = find_price(value)
                        if result:
                            return result
                elif isinstance(obj, list):
                    for item in obj:
                        result = find_price(item)
                        if result:
                            return result
                return None

            candidate = find_price(data)
            # Accept only sane prices; otherwise keep looking so the later
            # strategies still get a chance.
            if candidate and 1.0 <= candidate <= 5000.0:
                return candidate

        except Exception:
            continue

    # Strategy 2: itemprop="price" microdata.
    price_tag = soup.find(attrs={"itemprop": "price"})
    if price_tag:
        content = price_tag.get("content") or price_tag.text
        normalized = normalize_price(content)
        if normalized and 1.0 <= normalized <= 5000.0:
            return normalized

    # Strategy 3: intelligent fallback — median of all in-range numeric
    # values found anywhere in the visible text.
    text = soup.get_text(" ")
    prices = []
    for m in re.findall(r"\d+[.,]\d{2}", text):
        normalized = normalize_price(m)
        if normalized and 1.0 <= normalized <= 5000.0:
            prices.append(normalized)

    if prices:
        prices = sorted(set(prices))
        return prices[len(prices) // 2]

    return None




def search_google(driver, query: str, max_retries: int = 3) -> bool:
    """Search on Google with retry logic and exponential backoff.

    Locates the search box via a prioritized selector list, types *query*,
    then tries five submission methods in order: Enter, Ctrl+Enter, a search
    button click, native form submission, and JavaScript form submission.

    Args:
        driver: Selenium WebDriver already on a Google page.
        query: Search query text to type.
        max_retries: Attempts before giving up (backoff between attempts).

    Returns:
        True when the search was submitted, False otherwise.

    FIX: bare ``except:`` clauses narrowed to ``except Exception:`` so
    KeyboardInterrupt/SystemExit are no longer swallowed mid-scrape.
    """
    for attempt in range(max_retries):
        try:
            # Extended Google search box selectors with more variations
            search_selectors = [
                # Standard Google selectors
                (By.NAME, "q"),
                (By.CSS_SELECTOR, "textarea[name='q']"),
                (By.CSS_SELECTOR, "input[name='q']"),
                (By.XPATH, "//textarea[@name='q']"),
                (By.XPATH, "//input[@name='q']"),
                # Alternative selectors
                (By.CSS_SELECTOR, "[aria-label='Search']"),
                (By.CSS_SELECTOR, "input[aria-label='Search']"),
                (By.XPATH, "//*[@aria-label='Search']"),
                (By.CSS_SELECTOR, "[role='searchbox']"),
                (By.XPATH, "//*[@role='combobox' and @aria-owns]"),
                # Fallback to any visible input in the body
                (By.CSS_SELECTOR, "body input[type='text']"),
            ]

            search_box = None
            for selector_type, selector_value in search_selectors:
                try:
                    elements = driver.find_elements(selector_type, selector_value)
                    for element in elements:
                        try:
                            # Stale/hidden elements raise here; skip them.
                            if element.is_displayed():
                                search_box = element
                                logger.info(f"[Discovery] Found search box using {selector_type}: {selector_value}")
                                break
                        except Exception:
                            continue
                    if search_box:
                        break
                except Exception:
                    continue

            if not search_box:
                logger.warning("[Discovery] Could not find Google search box with any selector")
                logger.debug(f"[Discovery] Attempted {len(search_selectors)} selector combinations")
                return False

            # Click on the search box first to focus it
            try:
                search_box.click()
                human_delay(0.3, 0.7)
            except Exception:
                pass

            # Clear and type the query
            try:
                search_box.clear()
            except Exception:
                # If clear fails, just proceed
                pass

            search_box.send_keys(query)
            logger.info(f"[Discovery] Typed query: {query}")
            human_delay(0.8, 1.5)

            # Try multiple submit methods
            submit_success = False

            # Method 1: Press Enter
            try:
                search_box.send_keys(Keys.RETURN)
                logger.info("[Discovery] Pressed Enter")
                submit_success = True
            except Exception:
                logger.debug("[Discovery] Enter key failed, trying alternate methods")

            # Method 2: Try Ctrl+Enter (alternative for some sites)
            if not submit_success:
                try:
                    search_box.send_keys(Keys.CONTROL, Keys.RETURN)
                    logger.info("[Discovery] Pressed Ctrl+Enter")
                    submit_success = True
                except Exception:
                    pass

            # Method 3: Try to find and click search button (multiple selectors)
            if not submit_success:
                button_selectors = [
                    "//button[@aria-label='Google Search']",
                    "//button[@aria-label='google search']",
                    "//button[contains(@aria-label, 'Search')]",
                    "//button[contains(text(), 'Search')]",
                    "//input[@type='submit' and @value='Google Search']",
                    "//input[@type='submit']",
                    "//button[@type='submit']",
                    "//button[contains(@class, 'search')]",
                    "//button[contains(@class, 'btn') and contains(@class, 'search')]",
                    "//button[@jsaction]",
                    "//button[contains(@data-ved, '0ahUKE')]",  # Common Google button attribute
                    "//div[@role='button' and contains(., 'Search')]",
                    "//span[contains(text(), 'Search')]/parent::button",
                    "//span[contains(text(), 'Buscar')]/parent::button",  # Spanish for Search
                ]
                for btn_selector in button_selectors:
                    try:
                        buttons = driver.find_elements(By.XPATH, btn_selector)
                        if buttons and buttons[0].is_displayed():
                            buttons[0].click()
                            logger.info(f"[Discovery] Clicked search button using selector: {btn_selector}")
                            submit_success = True
                            break
                    except Exception:
                        continue

            # Method 4: Try form submission
            if not submit_success:
                try:
                    search_box.submit()
                    logger.info("[Discovery] Submitted form")
                    submit_success = True
                except Exception:
                    pass

            # Method 5: JavaScript form submission as last resort
            if not submit_success:
                try:
                    # Find the form containing the search box
                    form = search_box.find_element(By.XPATH, "ancestor::form")
                    if form:
                        driver.execute_script("arguments[0].submit();", form)
                        logger.info("[Discovery] Submitted form via JavaScript")
                        submit_success = True
                except Exception:
                    pass

            if not submit_success:
                logger.warning("[Discovery] Could not submit search using any method")
                if attempt < max_retries - 1:
                    exponential_backoff_delay(attempt)
                    continue
                return False

            # Give the results page time to render before the caller parses it.
            human_delay(3, 5)
            return True

        except Exception as e:
            logger.error(f"[Discovery] Error during Google search (attempt {attempt + 1}/{max_retries}): {str(e)}")
            if attempt < max_retries - 1:
                exponential_backoff_delay(attempt)
            else:
                return False

    return False


def discover_alternative_vendors(barcode: str, product_name: str, msp: float, registered_domains: set) -> List[Dict]:
    """
    Discover alternative vendors via search (Google/Bing) and scrape their prices.
    Uses multiple fallback strategies when Google blocks the request.

    Args:
        barcode: Product barcode used in the search query.
        product_name: Product name (truncated to 60 chars for the query).
        msp: Minimum Selling Price used for compliance comparison.
        registered_domains: Domains of already-registered vendors to exclude.

    Returns:
        List of dicts describing each discovered vendor and its scraped price.
    """
    discovered_vendors = []
    search_query = f"{barcode} {product_name[:60]} precio comprar online"
    logger.info(f"[Discovery] Starting search with query: {search_query}")
    logger.info(f"[Discovery] Registered domains to exclude: {registered_domains}")

    # BUG FIX: these flags are read after the try block below (Strategy 2
    # checks `google_blocked`); they must exist even when an early exception
    # (e.g. driver initialization failure) skips the code that assigns them.
    google_success = False
    google_blocked = False

    # Strategy 1: Try Google first with proxies and proper delays
    logger.info("[Discovery] Strategy 1: Attempting Google Search...")
    driver = None
    try:
        driver = initialize_selenium_driver(use_proxy=False)  # Start without proxy

        # Navigate to Google - try specific locale first (Spain), then fallback to generic
        google_urls = [
            "https://www.google.es",  # Spain (for Spanish products)
            "https://www.google.com", # Fallback
        ]

        for google_url in google_urls:
            try:
                logger.info(f"[Discovery] Loading {google_url}...")
                driver.get(google_url)
                human_delay(2, 3.5)  # Longer initial delay

                # Check if page loaded
                if "Google" in driver.title or len(driver.page_source) > 1000:
                    logger.info(f"[Discovery] ✓ Loaded {google_url}")
                    # Add human-like behavior: scroll a bit
                    try:
                        driver.execute_script("window.scrollTo(0, window.innerHeight / 4);")
                        human_delay(0.5, 1.0)
                        driver.execute_script("window.scrollTo(0, 0);")
                        human_delay(0.5, 1.0)
                    except Exception:
                        pass
                    break
            except Exception as e:
                logger.warning(f"[Discovery] Could not load {google_url}: {str(e)}")
                continue

        # Try to dismiss cookie consent (non-critical)
        try:
            dismiss_buttons = [
                "//button[contains(., 'Aceptar')]",
                "//button[contains(., 'Accept')]",
                "//button[contains(., 'all')]",
                "//button[@jsaction]",
            ]
            for selector in dismiss_buttons:
                try:
                    buttons = driver.find_elements(By.XPATH, selector)
                    for btn in buttons:
                        try:
                            if btn.is_displayed() and btn.is_enabled():
                                btn.click()
                                logger.debug("[Discovery] Dismissed cookie consent")
                                human_delay(0.5, 1.0)
                                break
                        except Exception:
                            pass
                except Exception:
                    pass
        except Exception as e:
            logger.debug(f"[Discovery] Cookie dismissal skipped: {str(e)}")

        # Search on Google with retries
        logger.info(f"[Discovery] Attempting Google search for: {search_query}")
        google_success = search_google(driver, search_query, max_retries=2)

        # Check for CAPTCHA or blocks
        current_url = driver.current_url.lower()
        page_source = driver.page_source.lower()

        captcha_indicators = [
            "sorry", "recaptcha", "robot", "automated", "blocked", "unusual traffic",
            "verify", "captcha", "puzzle", "challenge"
        ]

        google_blocked = any(indicator in current_url or indicator in page_source for indicator in captcha_indicators)

        if google_blocked:
            logger.warning("[Discovery] ⚠ CAPTCHA or block detected on Google")
            google_success = False

        # Parse results if Google succeeded
        if google_success and not google_blocked:
            try:
                html = driver.page_source
                soup = BeautifulSoup(html, "html.parser")

                # Extract search result links
                result_links = []
                all_links = soup.find_all("a", href=True)
                logger.info(f"[Discovery] Found {len(all_links)} total links on page")

                for link in all_links:
                    href = link.get("href", "")
                    # Filter out Google's own links
                    if href.startswith("http") and "google" not in href.lower() and not any(x in href for x in ["/ads/", "/aclk", "webcache"]):
                        try:
                            domain = urlparse(href).netloc.replace("www.", "")
                            if domain and domain not in registered_domains:
                                result_links.append(href)
                                logger.debug(f"[Discovery] Found link: {domain}")
                        except Exception as e:
                            logger.debug(f"[Discovery] Error parsing link: {str(e)}")

                # Deduplicate and limit
                result_links = list(set(result_links))[:5]
                logger.info(f"[Discovery] Will scrape {len(result_links)} unique vendor links from Google")

                # Scrape each discovered link
                for idx, link in enumerate(result_links, 1):
                    try:
                        logger.info(f"[Discovery {idx}/{len(result_links)}] Scraping: {link[:100]}")
                        driver.get(link)

                        try:
                            WebDriverWait(driver, 15).until(
                                lambda d: d.execute_script("return document.readyState") == "complete"
                            )
                        except Exception:
                            pass

                        human_delay(1.5, 2.5)

                        page_source = driver.page_source
                        disc_price = extract_price_from_html(page_source)

                        if disc_price:
                            domain = get_domain_from_url(link)
                            diff, perc_diff = calculate_price_difference(msp, disc_price)
                            # NOTE: local `status` shadows the fastapi `status`
                            # import within this function only.
                            status = determine_compliance_status(msp, disc_price)

                            discovered_vendors.append({
                                "vendor_id": None,
                                "vendor_name": domain,
                                "vendor_url": link,
                                "scraped_price": disc_price,
                                "msp": msp,
                                "status": status,
                                "price_difference": diff,
                                "percentage_difference": perc_diff,
                            })
                            logger.info(f"[Discovery] ✓ Price {disc_price} found at {domain}")
                        else:
                            logger.debug(f"[Discovery] No price found at {link}")

                    except Exception as e:
                        logger.warning(f"[Discovery] Error scraping {link}: {str(e)}")

                    human_delay(2, 4)  # Longer delays between vendor scrapes

            except Exception as e:
                logger.error(f"[Discovery] Error parsing Google results: {str(e)}")

    except Exception as e:
        logger.error(f"[Discovery] Google strategy error: {str(e)}", exc_info=True)
    finally:
        if driver:
            try:
                cleanup_chrome_session(driver)
                driver.quit()
            except Exception:
                pass

    # Strategy 2: Fallback to Tavily API if Google fails (no browser needed, pure API)
    if not discovered_vendors or google_blocked:
        logger.info("[Discovery] Strategy 2: Google failed/blocked, attempting Tavily API Search...")
        api_links = search_tavily_api(search_query)

        if api_links:
            driver = None
            try:
                driver = initialize_selenium_driver(use_proxy=False)
                for idx, link in enumerate(api_links, 1):
                    try:
                        logger.info(f"[Discovery-Tavily {idx}/{len(api_links)}] Scraping: {link[:100]}")
                        driver.get(link)

                        try:
                            WebDriverWait(driver, 15).until(
                                lambda d: d.execute_script("return document.readyState") == "complete"
                            )
                        except Exception:
                            pass

                        human_delay(1.5, 2.5)

                        page_source = driver.page_source
                        disc_price = extract_price_from_html(page_source)

                        if disc_price:
                            domain = get_domain_from_url(link)
                            diff, perc_diff = calculate_price_difference(msp, disc_price)
                            status = determine_compliance_status(msp, disc_price)

                            discovered_vendors.append({
                                "vendor_id": None,
                                "vendor_name": domain,
                                "vendor_url": link,
                                "scraped_price": disc_price,
                                "msp": msp,
                                "status": status,
                                "price_difference": diff,
                                "percentage_difference": perc_diff,
                            })
                            logger.info(f"[Discovery] ✓ Price {disc_price} found via Tavily at {domain}")
                        else:
                            logger.debug(f"[Discovery] No price found at {link}")

                    except Exception as e:
                        logger.warning(f"[Discovery] Tavily error scraping {link}: {str(e)}")

                    human_delay(2, 4)

            except Exception as e:
                logger.warning(f"[Discovery] Tavily scraping error: {str(e)}")
            finally:
                if driver:
                    try:
                        cleanup_chrome_session(driver)
                        driver.quit()
                    except Exception:
                        pass
        else:
            logger.warning("[Discovery] Tavily API returned no results")

    logger.info(f"[Discovery] ✓ Completed all strategies. Found {len(discovered_vendors)} alternative vendors")
    return discovered_vendors


def discover_alternative_vendors_serp(barcode: str, product_name: str, msp: float, registered_domains: set) -> List[Dict]:
    """
    Discover alternative vendors using SERP API Google Shopping Light Engine.
    Queries SERP API for shopping results and extracts vendor information with prices.
    Sequence: First vendors scraped, then SERP API used for discovery.
    
    Args:
        barcode: Product barcode
        product_name: Product name
        msp: Minimum Selling Price
        registered_domains: Set of already registered vendor domains to exclude.
            NOTE(review): per the comments below, for the SERP path this set
            apparently holds vendor *names* rather than domains — confirm at
            the call sites.
    
    Returns:
        List of discovered vendor dictionaries with price information
    """
    discovered_vendors = []
    
    # Build search query info (for logging)
    logger.info(f"\n{'='*80}")
    logger.info(f"[Discovery SERP] 🔍 STARTING SERP DISCOVERY")
    logger.info(f"[Discovery SERP] Product: {barcode} - {product_name}")
    logger.info(f"[Discovery SERP] Registered domains to exclude: {registered_domains}")
    logger.info(f"[Discovery SERP] Number of registered domains: {len(registered_domains)}")
    logger.info(f"{'='*80}\n")
    
    try:
        # Use SERP API Google Shopping Light to search for shopping results
        # Pass product_name and barcode directly (not a pre-built query)
        results = search_serp_api(product_name=product_name, barcode=barcode, msp=msp)
        
        logger.info(f"[Discovery SERP] ✓ search_serp_api() returned {len(results)} total results to process")
        if results:
            logger.info(f"[Discovery SERP] Sample vendor names from SERP:")
            # Log up to the first three results as a sanity-check sample.
            for idx, result in enumerate(results[:3], 1):
                vendor = result.get("vendor_name", "Unknown")
                price = result.get("scraped_price", "N/A")
                logger.info(f"  [{idx}] {vendor} - Price: {price}")
        
        if not results:
            logger.warning("[Discovery SERP] SERP API returned no results")
            return discovered_vendors
        
        logger.info(f"[Discovery SERP] Processing {len(results)} SERP API results\n")
        
        # Process each SERP result: filter out registered vendors, then build
        # a discovery dict for each remaining result.
        for idx, result in enumerate(results, 1):
            try:
                vendor_url = result.get("vendor_url", "")
                vendor_name = result.get("vendor_name", "Unknown")
                product_title = result.get("product_title", "")
                scraped_price = result.get("scraped_price")
                source_type = result.get("source", "serp_api_shopping_light")
                status = result.get("status", "unknown")
                
                # A result without a URL cannot be acted on downstream.
                if not vendor_url:
                    logger.debug(f"[Discovery SERP] Skipping result {idx} ({vendor_name}): no URL")
                    continue
                
                # For SERP API results, check vendor_name against registered_domains (which now contains vendor names)
                # Since all SERP product_links are Google Shopping redirects, we skip domain extraction
                if vendor_name in registered_domains:
                    logger.info(f"[Discovery SERP] ⊘ Result {idx}: {vendor_name} - FILTERED (vendor already registered)")
                    continue
                
                # Also check if vendor_name is a substring of any registered domain
                # E.g., if "amazon.com" is registered, skip vendor name "amazon"
                # NOTE(review): this bidirectional substring match is broad —
                # short vendor names can false-positive against longer domains.
                matching_domain = None
                is_registered = False
                for domain in registered_domains:
                    if vendor_name.lower() in domain.lower() or domain.lower() in vendor_name.lower():
                        matching_domain = domain
                        is_registered = True
                        break
                
                if is_registered:
                    logger.info(f"[Discovery SERP] ⊘ Result {idx}: '{vendor_name}' - FILTERED")
                    logger.info(f"           Reason: Substring match with registered domain '{matching_domain}'")
                    logger.info(f"           Logic: '{vendor_name.lower()}' in '{matching_domain.lower()}' OR '{matching_domain.lower()}' in '{vendor_name.lower()}' = TRUE")
                    continue
                else:
                    logger.info(f"[Discovery SERP] ✓ Result {idx}: '{vendor_name}' PASSES")
                    logger.info(f"           Reason: No match against {len(registered_domains)} registered domains: {registered_domains}")
                
                # Build discovery result
                # NOTE(review): violation_date is a naive local timestamp —
                # confirm whether a timezone-aware (UTC) datetime is expected.
                discovery_dict = {
                    "vendor_name": vendor_name,
                    "vendor_url": vendor_url,
                    "product_title": product_title,
                    "scraped_price": scraped_price,
                    "msp": msp,
                    "source_type": source_type,
                    "status": status,
                    "compliance_status": status,
                    "violation_date": datetime.now(),
                }
                
                # Add price difference if price exists.
                # NOTE(review): a scraped_price of 0 is treated as "no price"
                # by this truthiness check. Here price_difference = msp - price
                # (positive when below MSP); confirm the sign convention agrees
                # with the helper used by the other discovery path.
                if scraped_price:
                    price_diff = msp - scraped_price
                    discovery_dict["price_difference"] = price_diff
                    # Guard against division by zero when msp is 0.
                    discovery_dict["percentage_difference"] = (price_diff / msp * 100) if msp > 0 else 0
                    
                    logger.info(
                        f"[Discovery SERP] Result {idx}: {vendor_name} "
                        f"- URL: {vendor_url[:60]} "
                        f"- Price: ₹{scraped_price:.2f} "
                        f"- Status: {status.upper()}"
                    )
                else:
                    logger.info(
                        f"[Discovery SERP] Result {idx}: {vendor_name} "
                        f"- URL: {vendor_url[:60]} "
                        f"- Price: N/A "
                        f"- Status: {status.upper()}"
                    )
                
                discovered_vendors.append(discovery_dict)
                
            except Exception as e:
                # Per-result failures are logged and skipped so one bad result
                # does not abort the whole discovery batch.
                logger.warning(f"[Discovery SERP] Error processing result {idx}: {str(e)}")
                continue
        
        logger.info(f"[Discovery SERP] ✓ Discovered {len(discovered_vendors)} alternative vendors via SERP API")
        
        # Summary statistics
        filtered_count = len(results) - len(discovered_vendors)
        logger.info(f"\n[Discovery SERP] {'='*80}")
        logger.info(f"[Discovery SERP] FILTERING SUMMARY:")
        logger.info(f"[Discovery SERP]   Total SERP results received: {len(results)}")
        logger.info(f"[Discovery SERP]   ✓ Passed filters (new vendors): {len(discovered_vendors)}")
        logger.info(f"[Discovery SERP]   ⊘ Filtered (registered vendors): {filtered_count}")
        logger.info(f"[Discovery SERP] {'='*80}\n")
        
    except Exception as e:
        logger.error(f"[Discovery SERP] Critical error in SERP discovery: {str(e)}")
    
    return discovered_vendors


def scrape_vendor_website(product: Product, vendor: Vendor, driver, timeout_seconds: int = 15) -> Optional[float]:
    """Scrape price from vendor website with improved search strategy.

    Navigates to the vendor's site, dismisses overlays, then searches for the
    product on-site: first by product name, then by barcode as a fallback. If
    both searches fail, the current page is still scraped for a price.

    Args:
        product: Product whose ``product_name``/``barcode`` drive the search.
        vendor: Vendor whose ``website_url`` is loaded; ``name`` is used in logs.
        driver: An already-initialized Selenium WebDriver instance.
        timeout_seconds: Maximum seconds to wait for ``document.readyState``
            to become ``"complete"``; a timeout is non-fatal.

    Returns:
        The first extracted price as a float, or ``None`` if no price was
        found or any error occurred (errors are logged, never raised).
    """
    try:
        logger.info(f"[Vendor] ===== Scraping {vendor.name} ({vendor.website_url}) =====")

        # Navigate to vendor URL
        logger.info(f"[Vendor] Loading website...")
        driver.get(vendor.website_url)

        # Wait for page load; on timeout we continue and scrape whatever
        # rendered so far. `except Exception` (not a bare except) so that
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            WebDriverWait(driver, timeout_seconds).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except Exception:
            logger.warning(f"[Vendor] Page load timeout after {timeout_seconds}s, continuing anyway...")

        # Dismiss overlays/popups
        dismiss_overlays(driver)
        human_delay(0.5, 1.0)

        logger.info(f"[Vendor] Page loaded. Title: {driver.title}")

        # Strategy: Try product name first
        logger.info(f"[Vendor] SEARCH ATTEMPT 1: Product name '{product.product_name}'")
        search_success = find_and_search(driver, product.product_name, vendor.name)

        if search_success:
            logger.info(f"[Vendor] ✓ Product name search succeeded")
            human_delay(1, 2)
        else:
            # Strategy: Try barcode if product name failed
            logger.warning(f"[Vendor] Product name search failed. Trying barcode...")
            logger.info(f"[Vendor] SEARCH ATTEMPT 2: Barcode '{product.barcode}'")
            search_success = find_and_search(driver, product.barcode, vendor.name)

            if search_success:
                logger.info(f"[Vendor] ✓ Barcode search succeeded")
                human_delay(1, 2)
            else:
                logger.warning(f"[Vendor] Barcode search also failed. Trying to scrape current page...")
                human_delay(1, 2)

        # Dismiss any new overlays that appeared after search; best-effort.
        try:
            dismiss_overlays(driver)
        except Exception:
            pass

        # Wait briefly for any price-looking pattern (e.g. "123.45") to appear.
        if search_success:
            try:
                WebDriverWait(driver, 5).until(
                    lambda d: re.search(r"\d+[.,]\d{2}", d.page_source)
                )
                logger.debug(f"[Vendor] Price pattern detected on page")
            except Exception:
                logger.debug(f"[Vendor] No price pattern found within 5s")

        # Extract price from current page
        page_source = driver.page_source
        scraped_price = extract_price_from_html(page_source)

        if scraped_price:
            logger.info(f"[Vendor] ✓✓✓ PRICE FOUND: {scraped_price} from {vendor.name}")
            return scraped_price
        else:
            logger.warning(f"[Vendor] ✗ No price extracted from {vendor.name}")
            logger.debug(f"[Vendor] Page title: {driver.title}")
            logger.debug(f"[Vendor] Current URL: {driver.current_url}")
            return None

    except Exception as e:
        logger.error(f"[Vendor] ❌ Scraping error for {vendor.name}: {str(e)}")
        return None


class ScraperService:
    """Service for scraping product prices from vendors."""

    @staticmethod
    async def scrape_product(
        db: AsyncSession,
        product_id: int,
        enable_discovery: bool = False,
        headless_mode: bool = True,
        timeout_seconds: int = 15
    ) -> Dict:
        """
        Scrape a product against all registered vendors.

        Runs Selenium scraping of every active vendor in a worker thread,
        optionally discovers additional vendors via Google, deduplicates
        results by (vendor, product, MSP, price), and persists
        ScrapingResult rows plus duplicate-checked Violation rows.

        Args:
            db: Database session
            product_id: Product ID to scrape
            enable_discovery: If True, also discover vendors via Google (disabled by default)
            headless_mode: Run browser in headless mode
            timeout_seconds: Timeout for page load

        Raises:
            HTTPException: 404 if the product does not exist, 400 if there
                are no active vendors, 500 on any scraping/persistence error
                (after rolling back the session).

        Returns:
            Dictionary with scraping results
        """
        # Fetch product
        result = await db.execute(select(Product).where(Product.id == product_id))
        product = result.scalars().first()
        
        if not product:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Product with ID {product_id} not found"
            )

        # Fetch all active vendors
        vendor_result = await db.execute(select(Vendor).where(Vendor.is_active == True))
        vendors = vendor_result.scalars().all()
        
        if not vendors:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="No active vendors found"
            )

        results = []
        violation_records = []
        created_violations_set = set()  # Track violations actually added to database
        
        # Collect registered vendor domains for exclusion in discovery
        registered_domains = set()
        for vendor in vendors:
            domain = get_domain_from_url(vendor.website_url)
            registered_domains.add(domain)
        
        try:
            # Run scraping in thread pool to avoid blocking event loop
            # (Selenium is synchronous/blocking).
            scraping_task = asyncio.to_thread(
                ScraperService._run_scraping,
                product,
                vendors,
                headless_mode,
                timeout_seconds
            )
            results, violation_records = await scraping_task
            
            # optionally perform google discovery
            if enable_discovery:
                logger.info(f"[Scraper] Google discovery ENABLED for product {product_id}")
                try:
                    # Build normalized URLs set from vendor results to check against
                    vendor_normalized_urls = set()
                    for result in results:
                        url = result.get("vendor_url", "")
                        normalized = normalize_url_for_comparison(url)
                        vendor_normalized_urls.add(normalized)
                    
                    logger.info(f"[Discovery] Starting with {len(vendor_normalized_urls)} vendor URLs to check against")
                    
                    # Keep discovering until we get enough unique URLs or hit max attempts
                    max_discovery_attempts = 3
                    all_discovered = []
                    attempt = 0
                    
                    while attempt < max_discovery_attempts:
                        attempt += 1
                        logger.info(f"[Discovery] Attempt {attempt}/{max_discovery_attempts}")
                        
                        # run in thread since selenium is synchronous
                        discovered = await asyncio.to_thread(
                            discover_alternative_vendors, 
                            product.barcode, 
                            product.product_name, 
                            float(product.msp), 
                            registered_domains
                        )
                        logger.info(f"[Discovery] Attempt {attempt} completed. Found {len(discovered)} alternative vendors")
                        
                        # Store all discovered vendors directly without duplicate checking
                        # (cross-attempt duplicates are handled by the vendor-based
                        # deduplication pass below).
                        all_discovered.extend(discovered)
                        logger.info(f"[Discovery] Added {len(discovered)} vendor URLs to storage")
                        
                        # Stop after max attempts
                        if attempt >= max_discovery_attempts:
                            logger.info(f"[Discovery] Completed {max_discovery_attempts} attempts")
                            break
                    
                    # Extend results with all discovered (both new and from previous attempts)
                    logger.info(f"[Discovery] COMPLETE: Found {len(all_discovered)} total unique discovery URLs")
                    # Mark these as Google discovered for source_type assignment in the results processing loop
                    for item in all_discovered:
                        if "source_type_hint" not in item:
                            item["source_type_hint"] = "discovered"
                    results.extend(all_discovered)
                    
                    # Don't create violations here - they will be created in the unified results processing loop below
                    # This prevents double-creation and ensures consistent source_type assignment
                except Exception as exc:
                    # Discovery is best-effort; registered-vendor results still
                    # get persisted even if discovery blows up.
                    logger.error(f"[Discovery] Exception during discovery: {str(exc)}", exc_info=True)
            else:
                logger.info(f"[Scraper] Google discovery DISABLED for product {product_id}")

            # Remove duplicate results to prevent duplicate violations
            # Key insight: Same vendor can be found via different URLs (registered vendor URL vs Google SERP URL)
            # So we deduplicate based on VENDOR + PRICE, not URL
            # This deduplication removes the same vendor selling at the same price, regardless of URL source
            seen_result_keys = set()
            deduplicated_results = []
            removed_duplicates = []
            
            for result in results:
                url = result.get("vendor_url", "")
                vendor_name = result.get("vendor_name", "Unknown")
                scraped_price = result.get("scraped_price")
                msp = result.get("msp")
                
                # Use vendor-based key: (vendor_name, product_name, msp, scraped_price)
                # This deduplicates the same vendor selling at the same price, regardless of URL source
                # Key: NOT including URL, which may differ between registered vendor site and Google discovery
                # NOTE(review): a scraped_price of exactly 0 is falsy and keyed
                # as None here — presumably 0 never occurs; confirm upstream.
                dedup_key = (vendor_name, product.product_name, msp, round(scraped_price, 2) if scraped_price else None)
                
                if dedup_key not in seen_result_keys:
                    seen_result_keys.add(dedup_key)
                    deduplicated_results.append(result)
                    logger.info(f"[Dedup] ✅ KEPT: {vendor_name} - {url[:70]} (Product: {product.product_name}, MSP: {msp}, Price: {scraped_price})")
                else:
                    removed_duplicates.append(result)
                    logger.info(f"[Dedup] ❌ REMOVED DUPLICATE: {vendor_name} - {url[:70]} (Product: {product.product_name}, MSP: {msp}, Price: {scraped_price}) - Same vendor already found from another source")
            
            results = deduplicated_results
            logger.info(f"\n[Deduplication Summary]")
            logger.info(f"  Input URLs: {len(results) + len(removed_duplicates)}")
            logger.info(f"  Unique results kept: {len(results)}")
            logger.info(f"  Duplicates removed: {len(removed_duplicates)}")
            logger.info(f"  Vendor scraped: {sum(1 for r in results if r.get('vendor_id'))}")
            logger.info(f"  Discovery found: {sum(1 for r in results if not r.get('vendor_id'))}\n")


            # Save all results to database
            # Track violations we're about to create to prevent in-session duplicates
            created_violations = set()  # Track (url, price) tuples of violations we create
            for result in results:
                vendor_id = result.get("vendor_id")  # may be None for discovery, or a string UUID
                vendor_name = result["vendor_name"]
                vendor_url = result["vendor_url"]
                scraped_price = result["scraped_price"]
                msp = result["msp"]
                vendor_status = result["status"]
                price_diff = result.get("price_difference")
                perc_diff = result.get("percentage_difference")
                
                # Determine source type based on vendor_id and source_type_hint
                if vendor_id:
                    source = "registered"
                elif result.get("source_type_hint") == "serp_api_shopping":
                    source = "serp_api_shopping"
                else:
                    source = "discovered"
                    
                domain = get_domain_from_url(vendor_url)
                
                # Convert vendor_id from string to UUID if it's a registered vendor
                vendor_id_uuid = None
                if vendor_id:
                    try:
                        vendor_id_uuid = UUID(vendor_id) if isinstance(vendor_id, str) else vendor_id
                    except (ValueError, TypeError):
                        logger.warning(f"Could not convert vendor_id to UUID: {vendor_id}")
                        vendor_id_uuid = None

                # Create scraping result record
                scraping_result = ScrapingResult(
                    product_id=product.id,
                    vendor_id=vendor_id_uuid,
                    product_name=product.product_name,
                    barcode=product.barcode,
                    reference_id=str(product.reference_id),
                    vendor_name=vendor_name,
                    vendor_url=vendor_url,
                    domain_name=domain,
                    msp=msp,
                    scraped_price=scraped_price,
                    price_difference=price_diff,
                    percentage_difference=perc_diff,
                    compliance_status=vendor_status,
                    source_type=source,
                )

                db.add(scraping_result)

                # Create violation record if applicable (with duplicate prevention)
                if vendor_status == "violation":
                    # Use vendor-based key: (vendor_name, product_name, msp, scraped_price)
                    # This matches the deduplication logic in the results processing above
                    # Same vendor selling at same price is considered a duplicate, regardless of URL
                    violation_key = (vendor_name, product.product_name, msp, round(scraped_price, 2))
                    
                    # Check for duplicates in current session first (in-memory)
                    if violation_key in created_violations:
                        logger.info(f"⚠️ IN-SESSION DUPLICATE VIOLATION PREVENTED: {vendor_name} - {product.product_name} at {vendor_url[:80]}")
                    else:
                        # Check for duplicates in database
                        is_duplicate = await check_duplicate_violation(
                            db, vendor_url, scraped_price, product.barcode, product.product_name, msp, vendor_name=vendor_name
                        )
                        
                        if not is_duplicate:
                            # Determine marketplace: domain for registered, vendor_name for discovered
                            if vendor_id_uuid:
                                marketplace = get_domain_from_url(vendor_url)
                            else:
                                marketplace = vendor_name
                            
                            violation = Violation(
                                vendor_id=vendor_id_uuid,
                                vendor_name=vendor_name,
                                product_name=product.product_name,
                                msp=msp,
                                scraped_price=scraped_price,
                                price_difference=price_diff,
                                percentage_difference=perc_diff,
                                marketplace=marketplace,
                                violation_date=datetime.utcnow(),
                                url=vendor_url,
                                barcode_number=product.barcode,
                                reference_id=str(product.reference_id),
                                source_type=source,
                            )
                            db.add(violation)
                            created_violations.add(violation_key)  # Track this violation
                            # Also track that this violation was actually created
                            created_violations_set.add(violation_key)
                            logger.warning(f"🚨 VIOLATION SAVED: {vendor_name} - {product.product_name} @ ₹{scraped_price} (MSP: ₹{msp})")
                        else:
                            logger.info(f"⚠️ DATABASE DUPLICATE VIOLATION PREVENTED: {vendor_name} - {product.product_name} @ ₹{scraped_price}")

            # Update last_scraped_date for product
            # NOTE(review): datetime.utcnow() produces naive timestamps;
            # presumably the rest of the app is also naive-UTC — confirm.
            product.last_scraped_date = datetime.utcnow()
            execution_time_str = datetime.utcnow().isoformat()
            product.last_execution_time = execution_time_str

            # Commit all changes
            await db.commit()
            
            # Refresh product to ensure changes are persisted
            await db.refresh(product)
            
            # Build filtered violations response - only include violations actually created
            # Map violations by vendor-based key to match against created_violations_set
            filtered_violations = []
            for v in violation_records:
                vendor_url = v.get("vendor_url", "")
                vendor_name = v.get("vendor_name", "Unknown")
                scraped_price = float(v.get("scraped_price", 0))
                msp = v.get("msp", product.msp)
                
                # Create violation key same way as when creating violations
                # Vendor-based key: (vendor_name, product_name, msp, scraped_price)
                violation_key = (vendor_name, product.product_name, msp, round(scraped_price, 2))
                
                # Only include violations that were actually saved to database
                if violation_key in created_violations_set:
                    filtered_violations.append(v)
                    logger.info(f"✅ [Response] Including violation: {v.get('vendor_name')} - {vendor_url[:60]}")
                else:
                    logger.info(f"⏭️  [Response] Skipping violation (duplicate/not saved): {v.get('vendor_name')} - {vendor_url[:60]}")
            
            # Build results response - all deduplicated URLs (both vendor and discovery)
            # Results already deduplicated earlier, so include all
            logger.info(f"📊 [Response] Final Results: {len(results)} unique URLs, {len(filtered_violations)} violations created")
            
            resp = {
                "product_id": product_id,
                "product_name": product.product_name,
                "msp": float(product.msp),
                "scraped_count": len(results),  # All unique URLs (vendors + unique discoveries)
                "violation_count": len(created_violations_set),  # Only violations actually saved
                "results": results,  # All unique URL results (no duplicates)
                "violations": filtered_violations,  # Only violations that were actually created
                "execution_time": execution_time_str,
            }
            return resp

        except Exception as e:
            # Any failure after the fetch phase rolls back the session so no
            # partial ScrapingResult/Violation rows are persisted.
            logger.error(f"❌ Fatal scraping error: {str(e)}")
            await db.rollback()
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Scraping failed: {str(e)}"
            )

    @staticmethod
    def _run_scraping(product: Product, vendors: List[Vendor], headless_mode: bool, timeout_seconds: int) -> Tuple[List[Dict], List[Dict]]:
        """Run the actual scraping in a separate thread.

        Creates a single Selenium driver, scrapes each vendor's site for the
        product's price, classifies compliance against the product's MSP, and
        always quits the driver on the way out.

        Args:
            product: Product being checked (name, barcode, msp are used).
            vendors: Active vendors to scrape; entries without a
                ``website_url`` are skipped.
            headless_mode: Passed through to the driver initializer.
            timeout_seconds: Per-page load timeout for vendor scraping.

        Returns:
            Tuple of (results, violation_records): ``results`` has one dict
            per scraped vendor (price, status, differences); ``violation_records``
            is the subset whose status is "violation".

        Raises:
            Exception: Re-raises any fatal error (driver init, etc.) after
                logging; per-vendor scrape errors are handled inside
                ``scrape_vendor_website`` and yield a None price instead.
        """
        results = []
        violation_records = []
        
        driver = None
        try:
            driver = initialize_selenium_driver(headless=headless_mode)
            
            logger.info(f"\n{'='*80}")
            logger.info(f"[Scraper] Starting scrape for PRODUCT: {product.product_name} (Barcode: {product.barcode})")
            logger.info(f"[Scraper] MSP (Minimum Selling Price): {product.msp}")
            logger.info(f"[Scraper] Total vendors to check: {len(vendors)}")
            logger.info(f"{'='*80}\n")
            
            for idx, vendor in enumerate(vendors, 1):
                if not vendor.website_url:
                    logger.warning(f"[Scraper] [{idx}/{len(vendors)}] Vendor '{vendor.name}' has no website URL, skipping...")
                    continue

                logger.info(f"\n[Scraper] [{idx}/{len(vendors)}] Processing vendor: {vendor.name}")
                
                # Scrape vendor with search and price extraction
                scraped_price = scrape_vendor_website(product, vendor, driver, timeout_seconds)
                
                msp = float(product.msp)
                vendor_status = determine_compliance_status(msp, scraped_price)
                
                price_diff = None
                perc_diff = None
                
                # NOTE(review): a scraped price of exactly 0 is falsy and skips
                # the difference calculation — presumably 0 never occurs.
                if scraped_price:
                    price_diff, perc_diff = calculate_price_difference(msp, scraped_price)

                results.append({
                    "vendor_id": str(vendor.id),
                    "vendor_name": vendor.name,
                    "vendor_url": vendor.website_url,
                    "scraped_price": scraped_price,
                    "msp": msp,
                    "status": vendor_status,
                    "price_difference": price_diff,
                    "percentage_difference": perc_diff
                })
                
                # Log the result
                if scraped_price:
                    if vendor_status == "violation":
                        logger.warning(f"[Scraper] [{idx}/{len(vendors)}] 🚨 VIOLATION: {vendor.name} - Price {scraped_price} < MSP {msp} (Diff: {price_diff})")
                    elif vendor_status == "complain":
                        logger.info(f"[Scraper] [{idx}/{len(vendors)}] ⚠️  COMPLAIN: {vendor.name} - Price {scraped_price} > MSP {msp} (Diff: +{price_diff})")
                    else:
                        logger.info(f"[Scraper] [{idx}/{len(vendors)}] ✅ COMPLIANT: {vendor.name} - Price {scraped_price} = MSP {msp}")
                else:
                    logger.warning(f"[Scraper] [{idx}/{len(vendors)}] ❓ UNKNOWN: {vendor.name} - No price found")

                # Track violations
                if vendor_status == "violation":
                    violation_records.append({
                        "vendor_id": str(vendor.id),
                        "vendor_name": vendor.name,
                        "vendor_url": vendor.website_url,
                        "product_name": product.product_name,
                        "scraped_price": scraped_price,
                        "msp": msp,
                        "price_difference": price_diff
                    })

                # Pause between vendors to look less bot-like.
                human_delay(1.5, 2.5)

            logger.info(f"\n{'='*80}")
            logger.info(f"[Scraper] ✓ Registered vendors scraping complete")
            logger.info(f"[Scraper] Results: {len(results)} vendors checked, {len(violation_records)} violations found")
            logger.info(f"{'='*80}\n")
            
            return results, violation_records

        except Exception as e:
            logger.error(f"❌ Fatal scraping error: {str(e)}")
            raise
        finally:
            if driver:
                # Best-effort cleanup; `except Exception` (not a bare except)
                # so KeyboardInterrupt/SystemExit are not swallowed here.
                try:
                    cleanup_chrome_session(driver)
                    driver.quit()
                    logger.info("[Scraper] ✓ WebDriver closed")
                except Exception:
                    pass
    @staticmethod
    async def scrape_product_serp(
        db: AsyncSession,
        product_id: int,
        enable_discovery: bool = False,
        headless_mode: bool = True,
        timeout_seconds: int = 15
    ) -> Dict:
        """
        Scrape a product against all registered vendors using SERP API for discovery.
        Same functionality as scrape_product, but uses SERP API Google Shopping Light 
        Engine instead of Tavily for discovering alternative vendors.
        
        Args:
            db: Database session
            product_id: Product ID to scrape
            enable_discovery: If True, also discover vendors via SERP API Shopping (disabled by default)
            headless_mode: Run browser in headless mode
            timeout_seconds: Timeout for page load
            
        Returns:
            Dictionary with scraping results
        """
        # Fetch product
        result = await db.execute(select(Product).where(Product.id == product_id))
        product = result.scalars().first()
        
        if not product:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Product with ID {product_id} not found"
            )

        # Fetch all active vendors
        vendor_result = await db.execute(select(Vendor).where(Vendor.is_active == True))
        vendors = vendor_result.scalars().all()
        
        if not vendors:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="No active vendors found"
            )

        results = []
        violation_records = []
        created_violations_set = set()
        
        # Collect registered vendor domains for exclusion in discovery
        registered_domains = set()
        for vendor in vendors:
            domain = get_domain_from_url(vendor.website_url)
            registered_domains.add(domain)
        
        try:
            # Run scraping in thread pool to avoid blocking event loop
            scraping_task = asyncio.to_thread(
                ScraperService._run_scraping,
                product,
                vendors,
                headless_mode,
                timeout_seconds
            )
            results, violation_records = await scraping_task
            
            # Process all registered and discovered violations to database
            # This ensures all violations are persisted before returning
            for result in results:
                if result.get("status") == "violation":
                    vendor_id = result.get("vendor_id")
                    vendor_name = result["vendor_name"]
                    vendor_url = result["vendor_url"]
                    scraped_price = result["scraped_price"]
                    msp = float(result["msp"])
                    price_diff = result.get("price_difference")
                    perc_diff = result.get("percentage_difference")
                    
                    # Determine source type
                    if vendor_id:
                        source = "registered"
                        vendor_id_uuid = UUID(vendor_id) if isinstance(vendor_id, str) else vendor_id
                    else:
                        source = "discovered"
                        vendor_id_uuid = None
                    
                    # Check for duplicates and create violation
                    is_duplicate = await check_duplicate_violation(
                        db, vendor_url, scraped_price, product.barcode, product.product_name, msp, vendor_name=vendor_name
                    )
                    
                    if not is_duplicate:
                        # Determine marketplace: domain for registered, vendor_name for discovered
                        if vendor_id_uuid:
                            marketplace = get_domain_from_url(vendor_url)
                        else:
                            marketplace = vendor_name
                        
                        violation = Violation(
                            vendor_id=vendor_id_uuid,
                            vendor_name=vendor_name,
                            product_name=product.product_name,
                            msp=msp,
                            scraped_price=scraped_price,
                            price_difference=price_diff,
                            percentage_difference=perc_diff,
                            marketplace=marketplace,
                            violation_date=datetime.utcnow(),
                            url=vendor_url,
                            barcode_number=product.barcode,
                            reference_id=str(product.reference_id),
                            source_type=source,
                        )
                        db.add(violation)
                        logger.warning(f"🚨 VIOLATION SAVED: {vendor_name} - {product.product_name} @ {vendor_url[:60]} (Source: {source})")
            
            # optionally perform SERP API discovery for alternative vendors
            if enable_discovery:
                logger.info(f"[Scraper SERP] SERP API discovery ENABLED for product {product_id}")
                try:
                    # Perform SERP API discovery with multiple attempts
                    max_discovery_attempts = 3
                    all_discovered = []
                    attempt = 0
                    
                    while attempt < max_discovery_attempts:
                        attempt += 1
                        logger.info(f"[Discovery SERP] Attempt {attempt}/{max_discovery_attempts}")
                        
                        # Run SERP discovery in thread
                        discovered = await asyncio.to_thread(
                            discover_alternative_vendors_serp, 
                            product.barcode, 
                            product.product_name, 
                            float(product.msp), 
                            registered_domains
                        )
                        logger.info(f"[Discovery SERP] Attempt {attempt} completed. Found {len(discovered)} alternative vendors")
                        
                        # Store all discovered vendors directly without duplicate checking
                        all_discovered.extend(discovered)
                        logger.info(f"[Discovery SERP] Added {len(discovered)} vendor URLs to storage")
                        
                        # Stop after attempts
                        # Stop the discovery loop once the configured attempt budget is spent.
                        if attempt >= max_discovery_attempts:
                            logger.info(f"[Discovery SERP] Completed {max_discovery_attempts} attempts")
                            break
                    
                    # Deduplicate discovered vendors across multiple attempts
                    # Use vendor-based key (no URL): vendor_name + product_name + MSP + price
                    # This prevents same vendor found in multiple discovery attempts from creating duplicates
                    seen_discovery_keys = set()
                    deduplicated_discovered = []
                    removed_discovery_duplicates = []
                    
                    for item in all_discovered:
                        url = item.get("vendor_url", "")
                        vendor_name = item.get("vendor_name", "Unknown")
                        scraped_price = item.get("scraped_price")
                        msp = item.get("msp")
                        
                        # Use vendor-based key (NO URL): (vendor_name, product_name, msp, scraped_price)
                        # Same vendor at same price is duplicate, regardless of URL source
                        # NOTE(review): the truthiness test means a scraped_price of 0 (or 0.0)
                        # collapses to None in the key, same as a missing price — confirm a
                        # zero price can never legitimately occur here.
                        dedup_key = (vendor_name, product.product_name, msp, round(scraped_price, 2) if scraped_price else None)
                        
                        if dedup_key not in seen_discovery_keys:
                            seen_discovery_keys.add(dedup_key)
                            deduplicated_discovered.append(item)
                            logger.info(f"[Discovery Dedup] ✅ KEPT: {vendor_name} - {url[:70]} (Product: {product.product_name}, MSP: {msp}, Price: {scraped_price})")
                        else:
                            removed_discovery_duplicates.append(item)
                            logger.info(f"[Discovery Dedup] ❌ REMOVED DUPLICATE: {vendor_name} - {url[:70]} (Product: {product.product_name}, MSP: {msp}, Price: {scraped_price}) - Same vendor already found")
                    
                    all_discovered = deduplicated_discovered
                    # NOTE: this log runs AFTER the reassignment above, so
                    # len(all_discovered) is the post-dedup count; adding the removed
                    # count back reconstructs the correct pre-dedup total.
                    logger.info(f"[Discovery Dedup] Before: {len(all_discovered) + len(removed_discovery_duplicates)} results, After: {len(all_discovered)} unique results ({len(removed_discovery_duplicates)} duplicates removed)")
                    
                    # Process SERP discovered violations to database with duplicate check
                    for item in all_discovered:
                        if item.get("status") == "violation":
                            vendor_name_serp = item.get("vendor_name", "Unknown")
                            vendor_url_serp = item.get("vendor_url", "")
                            scraped_price_serp = item.get("scraped_price")
                            # Fall back to the product's own MSP when the SERP item lacks one.
                            msp_serp = float(item.get("msp", product.msp))
                            
                            # Check database for duplicates before creating violation
                            # (guards against re-inserting a violation already persisted
                            # by an earlier run or by the registered-vendor pass).
                            is_duplicate_serp = await check_duplicate_violation(
                                db, vendor_url_serp, scraped_price_serp, product.barcode, product.product_name, msp_serp, vendor_name=vendor_name_serp
                            )
                            
                            if not is_duplicate_serp:
                                violation = Violation(
                                    vendor_id=None,  # SERP discovered vendors don't have vendor_id
                                    vendor_name=vendor_name_serp,
                                    product_name=product.product_name,
                                    msp=msp_serp,
                                    scraped_price=item["scraped_price"],
                                    price_difference=item.get("price_difference"),
                                    percentage_difference=item.get("percentage_difference"),
                                    marketplace=vendor_name_serp,  # For SERP, marketplace = vendor_name
                                    url=item.get("vendor_url", ""),
                                    barcode_number=product.barcode,
                                    reference_id=str(product.reference_id),
                                    source_type="serp_api_shopping",
                                    violation_date=datetime.utcnow()
                                )
                                db.add(violation)
                                # NOTE(review): unlike the dedup key above, this round() has no
                                # None guard — a violation item with scraped_price=None would
                                # raise TypeError here. Confirm "violation" items always carry
                                # a numeric price.
                                created_violations_set.add((vendor_name_serp, product.product_name, msp_serp, round(scraped_price_serp, 2)))
                                
                                # Also add to violation_records for response inclusion
                                violation_records.append({
                                    "vendor_name": vendor_name_serp,
                                    "vendor_url": vendor_url_serp,
                                    "product_name": product.product_name,
                                    "scraped_price": scraped_price_serp,
                                    "msp": msp_serp,
                                    "price_difference": item.get("price_difference")
                                })
                                
                                logger.warning(f"🔍 SERP VIOLATION SAVED: {vendor_name_serp} - {product.product_name} @ {scraped_price_serp}")
                            else:
                                logger.info(f"⚠️ SERP DUPLICATE PREVENTED: {vendor_name_serp} - {product.product_name} @ {scraped_price_serp}")
                    
                    # Extend results with deduplicated discovered vendors
                    logger.info(f"[Discovery SERP] COMPLETE: Found {len(all_discovered)} unique discovery URLs after deduplication")
                    results.extend(all_discovered)
                    logger.info(f"[Discovery SERP] Results extended with {len(all_discovered)} discovery vendors")
                    
                except Exception as e:
                    # Discovery is best-effort: log and continue so registered-vendor
                    # results (and the commit below) still go through.
                    logger.error(f"[Discovery SERP] Error during SERP discovery: {str(e)}")
            
            # Update product execution timestamps
            # NOTE(review): datetime.utcnow() yields a NAIVE timestamp and is
            # deprecated in Python 3.12+; presumably the DB columns expect naive
            # UTC — confirm before switching to datetime.now(timezone.utc).
            product.last_scraped_date = datetime.utcnow()
            execution_time_str = datetime.utcnow().isoformat()
            product.last_execution_time = execution_time_str
            
            # Commit all violations (registered + discovered + SERP) to database
            # in a single transaction; a failure here falls to the outer handler,
            # which rolls everything back.
            await db.commit()
            
            # Refresh product to ensure changes are persisted
            await db.refresh(product)
            
            logger.info(f"\n{'='*80}")
            logger.info(f"[Scraper SERP] ✓ Scraping complete for product {product_id}")
            logger.info(f"[Scraper SERP] Total Results: {len(results)}")
            logger.info(f"[Scraper SERP] Total Violations: {len(violation_records)}")
            logger.info(f"{'='*80}\n")
            
            # Response payload consumed by the API layer: raw per-vendor results,
            # the violation summaries built above, and execution metadata.
            return {
                "product_id": product_id,
                "product_name": product.product_name,
                "barcode": product.barcode,
                "msp": float(product.msp),
                "results": results,
                "violations": violation_records,
                "total_results": len(results),
                "last_execution_time": execution_time_str,
                "total_violations": len(violation_records),
                "discovery_enabled": enable_discovery,
                "source_type": "browser_with_serp_api_discovery"
            }
            
        except Exception as e:
            # Top-level failure boundary: undo any uncommitted DB changes and
            # surface the error to the client as a 500.
            # NOTE(review): this broad handler will also catch and re-wrap any
            # HTTPException raised earlier inside the try — confirm that is
            # intended rather than letting FastAPI status codes propagate.
            logger.error(f"❌ Fatal scraping error: {str(e)}")
            await db.rollback()
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Scraping failed: {str(e)}"
            )
