#!/usr/bin/env python3



"""

Refresh `<section id="main-results-section">` product grids in PHP pages.


1. Query Google Images (google.es with `tbm=isch`) using `stem+...+site:amazon.es`.

2. Parse amazon.es DP links when embedded in HTML.

3. Fallback to Amazon.es SERP scraping (`https://www.amazon.es/s`) when ASINs are unavailable.


Preserves PHP placeholders `'.$tagafiliadoEs.'` and `'.$rutaimagen.'`.

Appends successfully written filenames to `refresh-products-logs.txt` (see --progress-log).

On startup, skips any `.php` name already recorded so the next alphabetical files continue.

Adds a random 200-500 ms pause after each Google/Amazon HTTP response.

CLI logging uses English.
"""



from __future__ import annotations



import argparse

import html

import logging

import random

import re

import sys

import time

from dataclasses import dataclass

from pathlib import Path

from urllib.parse import parse_qs, unquote, urlparse, urlunparse



import requests

from bs4 import BeautifulSoup



# Module-wide logger; every log line of this script goes through it.
LOGGER = logging.getLogger("refresh_amazon_products")

# Opening tag of the product grid section whose inner HTML is regenerated.
SECTION_OPEN_RE = re.compile(r'<section\s+id="main-results-section"\s*>', flags=re.IGNORECASE)

# Captures the 10-character ASIN from /dp/<ASIN> or /gp/product/<ASIN> paths.
ASIN_RE = re.compile(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:[/?#]|$)", flags=re.IGNORECASE)

# Browser-like request headers sent with every HTTP call.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0"
AMAZON_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"

# PHP files that are never processed when scanning the root folder.
SKIP_FILENAMES = frozenset(("index.php", "404.php"))

# Default basename of the resume log (one written filename per line).
PROGRESS_LOG_FILE_NAME = "refresh-products-logs.txt"

# Bounds (seconds) of the random pause applied after each HTTP response.
REQUEST_JITTER_MIN_SEC = 0.2
REQUEST_JITTER_MAX_SEC = 0.5


def inter_request_pause() -> None:
    """Sleep for a random interval between the configured jitter bounds.

    Called after HTTP requests so consecutive calls are spaced out and are
    less likely to be throttled.
    """
    time.sleep(random.uniform(REQUEST_JITTER_MIN_SEC, REQUEST_JITTER_MAX_SEC))


def configure_http_session_headers(session: requests.Session) -> None:
    """Install browser-like default headers on *session* (mutated in place)."""
    browser_defaults = {
        "User-Agent": USER_AGENT,
        "Accept": AMAZON_ACCEPT_HEADER,
        "Accept-Language": "es-ES,es;q=0.9,en;q=0.5",
        "Upgrade-Insecure-Requests": "1",
    }
    session.headers.update(browser_defaults)


def warmup_amazon_homepage(
    session: requests.Session,
    timeout: float,
    retries: int,
) -> None:
    """GET the amazon.es homepage so the session picks up cookies.

    Makes up to ``retries + 1`` attempts. Any status in the 2xx/3xx range
    counts as success; otherwise the attempt is followed by a linearly
    growing back-off. Failure is only logged, never raised.
    """
    failure: str | None = None

    for attempt in range(retries + 1):
        backoff = 1.2 * (attempt + 1)
        try:
            response = session.get("https://www.amazon.es/", timeout=timeout)
        except requests.RequestException as exc:
            failure = str(exc)
            inter_request_pause()
            time.sleep(backoff)
            continue

        inter_request_pause()
        status = response.status_code
        if 200 <= status < 400:
            LOGGER.info("Amazon.es homepage warmup succeeded (HTTP %s)", status)
            return
        failure = f"HTTP {status}"
        time.sleep(backoff)

    LOGGER.warning("Amazon.es homepage warmup failed: %s", failure)


def amazon_serp_looks_blocked(html_blob: str) -> bool:
    """Return True when the body contains a known anti-bot/captcha marker.

    The comparison is case-insensitive: the body is lowercased once and
    each lowercase marker is searched as a plain substring.
    """
    haystack = html_blob.lower()
    for marker in (
        "api-services-support@amazon",
        "enter the characters you see below",
        "robot check",
        "sorry, we just need to make sure you're not a robot",
        "captchatable",
        "glimpseamazon",
    ):
        if marker in haystack:
            return True
    return False


def load_progress_log(progress_path: Path) -> set[str]:
    """Read the progress log and return the recorded ``.php`` basenames.

    A missing file yields an empty set, as does an unreadable one (after a
    warning). Blank lines, lines starting with ``#`` and lines that do not
    end in ``.php`` are ignored; surrounding whitespace is stripped.
    """
    if not progress_path.is_file():
        return set()

    try:
        contents = progress_path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        LOGGER.warning("Cannot read progress log %s: %s", progress_path, exc)
        return set()

    entries = (raw_line.strip() for raw_line in contents.splitlines())
    return {
        entry
        for entry in entries
        if entry and not entry.startswith("#") and entry.endswith(".php")
    }


def append_progress_entry(progress_path: Path, basename: str) -> None:
    """Record *basename* as done by appending it as one line to the log.

    The parent directory is created when missing. The entry is stripped and
    terminated with a bare ``\\n`` regardless of platform.
    """
    progress_path.parent.mkdir(parents=True, exist_ok=True)
    entry = f"{basename.strip()}\n"
    with open(progress_path, mode="a", encoding="utf-8", newline="\n") as log_file:
        log_file.write(entry)





@dataclass(frozen=True)
class AmazonHit:
    """Immutable record describing one scraped Amazon.es product.

    Instances are built both from Google Images HTML and from Amazon.es
    search result pages.
    """

    # Canonical https://www.amazon.es/.../dp/<ASIN> URL, no affiliate tag.
    product_url_without_affiliate: str
    # 10-character Amazon product identifier (stored upper-case by callers).
    asin: str
    # Display title; callers use "Amazon product <ASIN>" as a placeholder.
    title: str
    # Thumbnail URL; an empty string means "no image known".
    image_url: str





def strip_google_redirect(href: str) -> str:
    """Return the destination of a Google ``/url`` redirect link.

    Handles absolute links on ``google.es`` (or a subdomain) and relative
    ``/url?...`` links, which is the form used inside Google's own result
    HTML. The destination is read from the ``q`` parameter, falling back to
    ``url``. Anything that is not such a redirect is returned unchanged.

    ``parse_qs`` already percent-decodes the parameter exactly once, so the
    value is returned as-is; decoding again would corrupt escapes that
    belong to the destination URL itself (e.g. ``%C3%A9`` in a slug).
    """
    parsed = urlparse(href)
    if not parsed.path.startswith("/url"):
        return href

    host = (parsed.hostname or "").lower()
    is_relative = not parsed.scheme and not parsed.netloc
    is_google = host == "google.es" or host.endswith(".google.es")
    if not (is_relative or is_google):
        return href

    query = parse_qs(parsed.query)
    for key in ("q", "url"):
        values = query.get(key)
        if values and values[0]:
            return values[0]
    return href





def canonical_dp_url(candidate: str) -> tuple[str | None, str | None]:
    """Normalise a scraped link to ``https://www.amazon.es/<slug>/dp/<ASIN>``.

    Args:
        candidate: Raw URL; may be HTML-escaped (``&amp;``), JSON-escaped
            (``\\/``), protocol-relative (``//``) or a Google redirect.

    Returns:
        ``(canonical_url, asin)`` with the ASIN upper-cased, or
        ``(None, None)`` when the link is not an amazon.es product URL or
        cannot be parsed.
    """
    try:
        raw = strip_google_redirect(candidate.strip().replace("&amp;", "&"))
        if raw.startswith("//"):
            raw = "https:" + raw
        if not raw.lower().startswith("http"):
            return None, None
        parsed = urlparse(raw.replace("\\/", "/"))
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. broken IPv6 hosts);
        # one bad scraped link must not abort processing of the whole page.
        return None, None

    # Compare the real hostname: a substring test would also accept hosts
    # such as "amazon.es.evil.com" or "notamazon.es".
    host = (parsed.hostname or "").lower()
    if host != "amazon.es" and not host.endswith(".amazon.es"):
        return None, None

    path = parsed.path or ""
    matched = ASIN_RE.search(path) or ASIN_RE.search(parsed.geturl())
    if not matched:
        return None, None

    asin_val = matched.group(1).upper()

    # Keep the SEO slug that precedes the product marker, if there is one.
    slug = ""
    for marker in ("/dp/", "/gp/product/"):
        if marker in path:
            slug = path.split(marker)[0].rstrip("/")
            break

    rel_path = f"{slug}/dp/{asin_val}" if slug else f"/dp/{asin_val}"
    if not rel_path.startswith("/"):
        rel_path = "/" + rel_path

    canon_url = urlunparse(("https", "www.amazon.es", rel_path, "", "", ""))
    return canon_url, asin_val





def amazon_hits_regex_fallback(html_blob: str, limit: int) -> list[AmazonHit]:
    """Build placeholder hits from ``data-asin="..."`` attributes in raw HTML.

    Used when structured parsing finds no result rows. ASINs are taken in
    document order, duplicates are dropped, and each hit gets a placeholder
    title plus a guessed thumbnail URL derived from the ASIN. Collection
    stops as soon as ``limit`` hits have been gathered.
    """
    hits: list[AmazonHit] = []
    known: set[str] = set()

    for found in re.finditer(r'data-asin="([A-Z0-9]{10})"', html_blob):
        asin = found.group(1).upper()
        if asin in known:
            continue
        known.add(asin)

        product_url, _ = canonical_dp_url(f"https://www.amazon.es/dp/{asin}")
        if not product_url:
            continue

        hits.append(
            AmazonHit(
                product_url_without_affiliate=product_url,
                asin=asin,
                title=f"Amazon product {asin}",
                image_url=f"https://m.media-amazon.com/images/P/{asin}.01._SCLZZZZZZZ_.jpg",
            )
        )
        # The limit is checked after appending, as in the original flow.
        if len(hits) >= limit:
            break

    return hits





def extract_google_thumb_pairs(html_blob: str) -> list[tuple[str, str]]:
    """Pair thumbnail and amazon.es landing URLs found in Google's JSON blobs.

    For every ``"ru"``/``"isu"`` value pointing at ``www.amazon.es`` the
    nearest preceding ``"ou"`` value (within a fixed look-behind window) is
    taken as its thumbnail. Landing URLs without such a thumbnail are
    skipped.

    Returns:
        ``(thumbnail_url, landing_url)`` tuples in document order, with
        JSON slash-escaping removed.
    """
    # Slashes may be JSON-escaped ("\/") or plain. The dots of the host are
    # never escaped in JSON, so they are matched as literal dots; requiring
    # a backslash before them could never match real data.
    landing_re = re.compile(r'"(?:ru|isu)":"(https:\\?/\\?/www\.amazon\.es[^"]+)"')
    thumb_re = re.compile(r'"ou":"(https:\\?/\\?/[^"]+)"')
    lookbehind_chars = 2400

    def unescape_slashes(url: str) -> str:
        return url.replace("\\\\/", "/").replace("\\/", "/")

    pairs: list[tuple[str, str]] = []
    for landing_match in landing_re.finditer(html_blob):
        window_start = max(0, landing_match.start() - lookbehind_chars)
        thumbs = thumb_re.findall(html_blob[window_start : landing_match.start()])
        if not thumbs:
            continue
        # The last "ou" before the landing URL is the closest one.
        thumb_plain = unescape_slashes(thumbs[-1])
        landing_plain = unescape_slashes(landing_match.group(1))
        pairs.append((thumb_plain, strip_google_redirect(landing_plain)))
    return pairs





def normalize_title(text: str, asin: str) -> str:
    """Return a single-line title of at most 200 characters.

    Carriage returns and line feeds become spaces and the result is
    stripped. An empty result is replaced by ``"Amazon product <asin>"``.
    Titles longer than 200 characters are cut to 197 plus ``"..."``.
    """
    single_line = text.translate({ord("\r"): " ", ord("\n"): " "}).strip()
    title = single_line or f"Amazon product {asin}"
    return title if len(title) <= 200 else title[:197] + "..."





# JSON-escaped product URLs as found in script blobs, e.g.
# https:\/\/www.amazon.es\/some-slug\/dp\/B000000000. Only the slashes are
# escaped; the host dots are literal dots.
_ESCAPED_DP_URL_RE = re.compile(
    r"https:\\/\\/www\.amazon\.es(?:\\/[^\\\"\s]+)*\\/(?:dp|gp\\/product)\\/"
    r"([A-Z0-9]{10})(?:[^\s\\\"]*)",
    flags=re.IGNORECASE,
)

# Plain (unescaped) product URLs appearing anywhere in the markup.
_PLAIN_DP_URL_RE = re.compile(
    r"https://www\.amazon\.es(?:\/[^?\s\"'<>]*)?/(?:dp|gp/product)/"
    r"([A-Z0-9]{10})(?:[^\s\"'<>]*)",
    flags=re.IGNORECASE,
)


def extract_amazon_hits(html_blob: str, limit: int) -> list[AmazonHit]:
    """Collect up to ``limit`` unique ASIN-backed hits from Google's HTML.

    Four passes run in order and feed one ASIN-keyed accumulator:

    1. ``<a href>`` anchors (title from the anchor, image from a nested
       ``<img>``),
    2. JSON-escaped amazon.es product URLs,
    3. plain amazon.es product URLs,
    4. thumbnail/landing pairs from Google's JSON blobs.

    The first sighting of an ASIN fixes its position and URL. Later
    sightings only improve the title (a real title beats a placeholder,
    otherwise the longer one wins) and fill in a missing image.

    Args:
        html_blob: Raw HTML of the Google result page.
        limit: Maximum number of distinct products to return.

    Returns:
        Hits in first-seen order, at most ``limit`` of them.
    """
    merged: dict[str, AmazonHit] = {}
    order: list[str] = []

    def is_placeholder(title: str) -> bool:
        return title.startswith("Amazon product ")

    def preferred_title(primary: str, secondary: str) -> str:
        if is_placeholder(secondary) and not is_placeholder(primary):
            return primary
        if is_placeholder(primary) and not is_placeholder(secondary):
            return secondary
        return primary if len(primary) >= len(secondary) else secondary

    def push(hit: AmazonHit) -> None:
        """Merge *hit* into a known ASIN, or register it if room is left."""
        prior = merged.get(hit.asin)
        if prior is not None:
            merged[hit.asin] = AmazonHit(
                product_url_without_affiliate=prior.product_url_without_affiliate,
                asin=prior.asin,
                title=preferred_title(prior.title, hit.title),
                image_url=prior.image_url or hit.image_url,
            )
            return
        if len(order) >= limit:
            return
        merged[hit.asin] = hit
        order.append(hit.asin)

    def push_placeholder(candidate_url: str, image_url: str = "") -> None:
        """Register a URL-only sighting under a placeholder title."""
        canon, asin = canonical_dp_url(candidate_url)
        if canon and asin:
            push(
                AmazonHit(
                    product_url_without_affiliate=canon,
                    asin=asin,
                    title=f"Amazon product {asin}",
                    image_url=image_url,
                )
            )

    # Pass 1: anchors. There is deliberately no early exit at the limit:
    # push() ignores new ASINs once full, but later anchors may still carry
    # a better title or an image for products already collected.
    soup = BeautifulSoup(html_blob, "html.parser")
    for anchor in soup.find_all("a", href=True):
        canon, asin = canonical_dp_url(strip_google_redirect(anchor["href"]))
        if not canon or not asin:
            continue
        label = (
            anchor.get("title")
            or anchor.get_text(strip=True, separator=" ").strip()
        )
        img_node = anchor.find("img")
        img_src = (img_node.get("src") or "").strip() if img_node is not None else ""
        push(
            AmazonHit(
                product_url_without_affiliate=canon,
                asin=asin,
                title=normalize_title(label or "", asin),
                image_url=img_src,
            )
        )

    # Pass 2: JSON-escaped URLs. These sightings carry neither title nor
    # image, so once the limit is reached they cannot add anything.
    for occurrence in _ESCAPED_DP_URL_RE.finditer(html_blob):
        if len(order) >= limit:
            break
        push_placeholder(
            occurrence.group(0).replace("\\/", "/").replace('\\\\"', '"')
        )

    # Pass 3: plain URLs; same reasoning as pass 2.
    for occurrence in _PLAIN_DP_URL_RE.finditer(html_blob):
        if len(order) >= limit:
            break
        push_placeholder(occurrence.group(0))

    # Pass 4: thumbnail pairs. No early exit, so products collected by the
    # earlier passes can still receive their thumbnail.
    for thumb, landing in extract_google_thumb_pairs(html_blob):
        push_placeholder(landing, image_url=thumb)

    return [merged[asin] for asin in order[:limit]]





def count_x2(section_inner: str) -> int:
    """Count product tiles, i.e. ``<div class="x2">`` openings.

    Matching is case-insensitive, accepts single or double quotes and
    requires the class attribute value to be exactly ``x2``.
    """
    tile_open = re.compile(r'<div\s+class\s*=\s*["\']x2["\']', flags=re.IGNORECASE)
    return sum(1 for _ in tile_open.finditer(section_inner))





def locate_main_section_span(text: str) -> tuple[int, int] | None:
    """Locate the inner content of ``<section id="main-results-section">``.

    Returns:
        ``(start, end)`` offsets into *text* such that ``text[start:end]``
        is everything between the opening tag and the first following
        ``</section>`` (matched case-insensitively), or ``None`` when either
        tag is missing.
    """
    opening = SECTION_OPEN_RE.search(text)
    if opening is None:
        return None

    inner_start = opening.end()
    # Search the original string rather than text.lower(): lowercasing can
    # change the length of some characters (e.g. "İ"), which would shift
    # the returned offset relative to the original text. re.ASCII keeps the
    # case folding limited to ASCII letters.
    closing = re.compile(r"</section>", re.IGNORECASE | re.ASCII).search(
        text, inner_start
    )
    if closing is None:
        return None
    return inner_start, closing.start()



# Matches C0 control characters (except tab, line feed and carriage return)
# plus DEL; these break HTML/XML or make the PHP source unreadable.
_CONTROL_AND_DISALLOWED_RE = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")


def sanitize_plaintext(value: str) -> str:
    """Strip disallowed control characters; falsy input yields ``""``."""
    return _CONTROL_AND_DISALLOWED_RE.sub("", value) if value else ""



def escape_html_double_quoted_attr(value: str) -> str:
    """Escape *value* for ``title="..."`` / ``alt="..."`` in PHP-embedded HTML.

    Control characters are removed, then ``&``, ``<``, ``>``, ``"`` and
    ``'`` are replaced by HTML entities and backticks by ``&#96;``.
    """
    # html.escape(quote=True) already converts "'" to "&#x27;", so no
    # separate apostrophe replacement is needed afterwards.
    escaped = html.escape(sanitize_plaintext(value), quote=True)
    return escaped.replace("`", "&#96;")



def escape_url_double_quoted_attr(url: str) -> str:
    """Escape a URL for use inside ``href="..."`` / ``src="..."``.

    The URL is stripped and cleaned of control characters; ``&``, ``<``
    and ``>`` become HTML entities; quotes, backticks and backslashes are
    percent-encoded; tabs and line breaks are dropped so the attribute
    cannot be split across lines.
    """
    # One pass handles both the removals and the percent-encoding; none of
    # these characters is produced or consumed by html.escape(quote=False).
    attr_table = str.maketrans(
        {
            "\r": None,
            "\n": None,
            "\t": None,
            '"': "%22",
            "'": "%27",
            "`": "%60",
            "\\": "%5C",
        }
    )
    entity_safe = html.escape(sanitize_plaintext(url.strip()), quote=False)
    return entity_safe.translate(attr_table)



def escape_html_element_text(value: str) -> str:
    """Escape *value* as element text (``<p>...</p>``) inside PHP strings.

    Control characters are removed and every whitespace run collapses to a
    single space before HTML-escaping.
    """
    single_spaced = " ".join(sanitize_plaintext(value).split())
    # quote=True also turns apostrophes into &#x27;, so a title such as
    # Readyy'y cannot terminate the surrounding PHP single-quoted string.
    return html.escape(single_spaced, quote=True)



def escape_attr(value: str) -> str:
    """Legacy name kept for older callers.

    Delegates to :func:`escape_html_double_quoted_attr` (attribute context).
    """
    attribute_safe = escape_html_double_quoted_attr(value)
    return attribute_safe





def php_single_quote(value: str) -> str:
    """Escape *value* for the inside of a PHP single-quoted string.

    Backslashes are doubled and apostrophes are prefixed with a backslash.
    """
    php_escapes = str.maketrans({"\\": "\\\\", "'": "\\'"})
    return value.translate(php_escapes)





def render_card(hit: AmazonHit) -> str:
    """Render one ``<div class="x2">`` product tile for *hit*.

    The markup is meant to sit inside a PHP single-quoted string: every
    product link ends in ``?'.$tagafiliadoEs.'`` and the buy button image
    uses ``'.$rutaimagen.'``, so PHP splices those values in at runtime.
    When the hit has no image, a thumbnail URL is guessed from the ASIN.

    Returns:
        The tile as newline-joined lines, without a trailing newline.
    """
    title_attr = escape_html_double_quoted_attr(hit.title)
    caption = escape_html_element_text(hit.title)
    fallback_image = (
        f"https://m.media-amazon.com/images/P/{hit.asin}.01._SCLZZZZZZZ_.jpg"
    )
    img_attr = escape_url_double_quoted_attr(hit.image_url or fallback_image)
    base_attr = escape_url_double_quoted_attr(hit.product_url_without_affiliate)

    # Product URL followed by the PHP affiliate-tag placeholder.
    tagged = base_attr + "?'.$tagafiliadoEs.'"

    def share_link(endpoint: str, icon: str) -> str:
        return (
            f'\t\t\t  <a href="{endpoint}{tagged}" class="compartir-f" target="_blank">'
            f'<img src="img/icono-{icon}.gif"/></a>'
        )

    image_link = (
        f'              <a class="img-peq" href="{tagged}" title="{title_attr}" '
        'target="_blank" rel="nofollow external">'
        f'<img alt="{title_attr}" src="{img_attr}"/></a>'
    )
    buy_buttons = (
        f'\t\t      <a class="boton" rel="external nofollow" href="{tagged}" target="_blank">'
        '<img src="\'.$rutaimagen.\'"/></a> '
        '<a class="boton-segundo" rel="external nofollow" target="_blank" '
        f'href="{tagged}">Comprar</a>'
    )

    return "\n".join(
        (
            '<div class="x2">',
            image_link,
            buy_buttons,
            '     <div class="estrellitas">\t\t\t ',
            '\t\t<div class="redondos">',
            '\t\t\t  <span class="compartir">Compartir:</span>',
            share_link("whatsapp://send?text=", "whatsapp"),
            share_link("http://www.twitter.com/share?url=", "twitter"),
            share_link("https://www.facebook.com/sharer.php?u=", "facebook"),
            "\t\t</div>",
            "     </div>\t\t\t   ",
            f'     <p class="parrafitos">{caption}</p>',
            "</div>",
        )
    )





def render_section_inner(hits: list[AmazonHit], slot_count: int) -> str:
    """Render the first ``min(slot_count, len(hits))`` hits as tiles.

    Tiles are separated by newlines and the result always ends with one
    newline (it is a lone newline when no tile is rendered).
    """
    usable = min(slot_count, len(hits))
    # max(..., 0) keeps a negative count equivalent to "no tiles".
    cards = [render_card(hit) for hit in hits[: max(usable, 0)]]
    return "\n".join(cards) + "\n"





def replace_section_content(source: str, new_inner: str) -> str:
    """Return *source* with the main results section body replaced.

    A newline is inserted right after the opening tag, followed by
    *new_inner*; the closing ``</section>`` and everything after it are
    kept as they were.

    Raises:
        ValueError: If the section cannot be located in *source*.
    """
    bounds = locate_main_section_span(source)
    if bounds is None:
        raise ValueError("main-results-section not found")
    inner_start, inner_end = bounds
    return "".join((source[:inner_start], "\n", new_inner, source[inner_end:]))





def patch_imagen_relacionada(source: str, image_url: str) -> str:
    """Set the first ``$ImagenRelacionada = '...';`` assignment to *image_url*.

    Args:
        source: Full PHP source text.
        image_url: New value; it is escaped for a PHP single-quoted string.

    Returns:
        The patched source, or *source* unchanged (after a warning) when no
        such assignment is found.
    """
    escaped_value = php_single_quote(image_url)

    # The old value may itself contain PHP escapes (\' or \\), so consume
    # backslash pairs as units instead of stopping at the first apostrophe.
    pattern = re.compile(
        r"(\$ImagenRelacionada\s*=\s*')((?:\\[\s\S]|[^'\\])*)(';)"
    )

    def substitute(match: re.Match) -> str:
        # A callable replacement inserts the value verbatim. Passed as a
        # template string, re would collapse the PHP escape "\\" to a single
        # backslash, so a scraped URL containing \' would come out as \\'
        # and terminate the PHP string early.
        return f"{match.group(1)}{escaped_value}{match.group(3)}"

    new_source, count = pattern.subn(substitute, source, count=1)
    if count == 0:
        LOGGER.warning("Could not update $ImagenRelacionada (pattern not found)")
    return new_source





def fetch_google_isch(
    session: requests.Session,
    keyword_stem: str,
    timeout: float,
    retries: int,
) -> str | None:
    """Fetch the Google Images result page for a keyword, limited to amazon.es.

    Args:
        session: HTTP session used for the request.
        keyword_stem: Hyphen-separated keywords (the PHP file stem).
        timeout: Per-request timeout in seconds.
        retries: Extra attempts after the first one.

    Returns:
        The response body on HTTP 200, or ``None`` once all attempts have
        failed (the last error is logged).
    """
    # The wire format must be q=aspirador+bomann+site:amazon.es. requests
    # URL-encodes parameter values itself (space -> "+"), so the words are
    # joined with spaces here; a literal "+" would be sent as "%2B" and the
    # site: operator would no longer be a separate token.
    query_text = f"{keyword_stem.replace('-', ' ')} site:amazon.es"
    params = {"tbm": "isch", "q": query_text, "source": "lnms"}
    last_error: str | None = None

    for attempt in range(retries + 1):
        backoff = 1.5 * (attempt + 1)
        try:
            # HTTPS directly: avoids the http->https redirect round trip.
            response = session.get(
                "https://www.google.es/search",
                params=params,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            last_error = str(exc)
            inter_request_pause()
            time.sleep(backoff)
            continue

        inter_request_pause()
        if response.status_code == 200:
            return response.text
        last_error = f"HTTP {response.status_code}"
        time.sleep(backoff)

    LOGGER.error("Google fetch failed for query %r: %s", query_text, last_error)
    return None





def parse_amazon_search_html_fragment(html_blob: str, limit: int) -> list[AmazonHit]:
    """Turn an Amazon.es search result page into ``AmazonHit`` rows.

    Result cards are ``div[data-component-type="s-search-result"]``, or any
    ``div`` with a ``data-asin`` attribute when none of those exist. For
    each card the ASIN, the ``<h2>`` title, an image and a product link are
    extracted; the link falls back to ``/dp/<ASIN>``. Duplicate ASINs are
    skipped and at most ``limit`` rows are produced.

    When nothing could be parsed and the page does not look like an
    anti-bot page, ASINs are recovered from the raw HTML instead.
    """
    soup = BeautifulSoup(html_blob, "html.parser")
    cards = soup.select('div[data-component-type="s-search-result"]')
    if not cards:
        cards = soup.find_all("div", attrs={"data-asin": True})

    rows: list[AmazonHit] = []
    known: set[str] = set()

    for card in cards:
        if len(rows) >= limit:
            break

        # ASIN: on the card itself, else on the first descendant carrying one.
        asin = (card.get("data-asin") or "").strip().upper()
        if len(asin) != 10:
            nested = card.select_one("[data-asin]")
            if nested is not None:
                asin = (nested.get("data-asin") or "").strip().upper()
        if len(asin) != 10 or asin in known:
            continue

        heading = card.find("h2")
        raw_title = heading.get_text(strip=True, separator=" ") if heading else ""
        title_text = normalize_title(raw_title, asin)

        # Prefer the main result image; otherwise take any image in the card.
        picture = card.find("img", class_=re.compile(r"\bs-image\b"))
        if picture is None:
            picture = card.find("img")
        thumb = ""
        if picture is not None:
            thumb = (picture.get("src") or picture.get("data-src") or "").strip()

        # Product link: first anchor with an ASIN-style href, else the
        # heading's own link.
        link = next(
            (
                candidate
                for candidate in card.find_all("a", href=True)
                if ASIN_RE.search(candidate["href"])
            ),
            None,
        )
        if not link and heading:
            link = heading.find("a", href=True)

        product_url: str | None = None
        if link and link.get("href"):
            href = link["href"]
            if href.startswith("/"):
                href = "https://www.amazon.es" + href
            product_url, _ = canonical_dp_url(href)
        if not product_url:
            product_url, _ = canonical_dp_url(f"https://www.amazon.es/dp/{asin}")
        if not product_url:
            continue

        rows.append(
            AmazonHit(
                product_url_without_affiliate=product_url,
                asin=asin,
                title=title_text,
                image_url=thumb,
            )
        )
        known.add(asin)

    if rows or amazon_serp_looks_blocked(html_blob):
        return rows
    return amazon_hits_regex_fallback(html_blob, limit)





def fetch_amazon_search_hits(
    session: requests.Session,
    keyword_stem: str,
    limit: int,
    timeout: float,
    retries: int,
) -> list[AmazonHit]:
    """Collect product candidates from Amazon.es search result pages.

    Pages 1 to 7 of ``https://www.amazon.es/s`` are requested in turn until
    ``limit`` distinct ASINs are gathered. Pagination stops early when a
    page cannot be downloaded or yields no parsed hits; in the latter case
    diagnostics are logged, including whether the body looks like a captcha
    page.

    Args:
        session: HTTP session used for the requests.
        keyword_stem: Hyphen-separated keywords (hyphens become spaces).
        limit: Maximum number of hits to return.
        timeout: Per-request timeout in seconds.
        retries: Extra attempts per page after the first one.

    Returns:
        Up to ``limit`` hits in the order they were found.
    """
    keywords = keyword_stem.replace("-", " ").strip()
    gathered: list[AmazonHit] = []
    known_asins: set[str] = set()
    last_error: str | None = None

    def download(page_number: int) -> str | None:
        """Return the body of one result page, or None after all attempts."""
        nonlocal last_error
        query = {"k": keywords, "page": str(page_number)}
        for attempt in range(retries + 1):
            backoff = 1.2 * (attempt + 1)
            try:
                response = session.get(
                    "https://www.amazon.es/s",
                    params=query,
                    timeout=timeout,
                    headers={"Referer": "https://www.amazon.es/"},
                )
            except requests.RequestException as exc:
                last_error = str(exc)
                inter_request_pause()
                time.sleep(backoff)
                continue

            inter_request_pause()
            if response.status_code == 200:
                return response.text
            last_error = f"HTTP {response.status_code}"
            time.sleep(backoff)
        return None

    for page_index in range(1, 8):
        if len(gathered) >= limit:
            break

        page_html = download(page_index)
        if not page_html:
            LOGGER.error(
                "Amazon search failed for %r page %s: %s",
                keywords,
                page_index,
                last_error,
            )
            break

        chunk = parse_amazon_search_html_fragment(page_html, limit - len(gathered))
        for row in chunk:
            if row.asin in known_asins:
                continue
            known_asins.add(row.asin)
            gathered.append(row)
            if len(gathered) >= limit:
                break

        if chunk:
            continue

        # An empty page ends pagination; log what the body looked like.
        blob = page_html or ""
        blocked_hint = amazon_serp_looks_blocked(blob)
        distinct_asins = set(re.findall(r'data-asin="([A-Z0-9]{10})"', blob))
        LOGGER.info(
            "Amazon search page %s returned zero parsed hits "
            "(blocked_hint=%s s-search-result=%s distinct_data-asin=%s html_len=%s); "
            "stopping pagination",
            page_index,
            blocked_hint,
            blob.count('data-component-type="s-search-result"'),
            len(distinct_asins),
            len(blob),
        )
        if blocked_hint:
            LOGGER.warning(
                "Amazon.es response resembles a bot/captcha interstitial; "
                "this IP may be blocked. Try a residential connection or an exit "
                "that matches normal amazon.es browsing (e.g. same locale)."
            )
        break

    if not gathered:
        LOGGER.warning(
            "Amazon.es search produced no ASINs for keywords %r (%s)",
            keywords,
            last_error,
        )

    return gathered[:limit]





def iter_target_files(root: Path, only_name: str | None) -> list[Path]:
    """List the PHP files to process.

    With *only_name* set, the result is just ``root / only_name``.
    Otherwise it is every ``*.php`` file directly inside *root*, sorted,
    minus the names in ``SKIP_FILENAMES``.

    Raises:
        FileNotFoundError: If *only_name* is given but is not a file.
    """
    if only_name:
        single = root / only_name
        if single.is_file():
            return [single]
        raise FileNotFoundError(f"Missing file {single}")

    selected: list[Path] = []
    for entry in sorted(root.glob("*.php")):
        if entry.name in SKIP_FILENAMES or not entry.is_file():
            continue
        selected.append(entry)
    return selected





def process_file(
    path: Path,
    session: requests.Session,
    args: argparse.Namespace,
) -> tuple[bool, bool]:
    """Refresh the product grid of one PHP page.

    Products come from Google Images and, when that yields nothing and
    ``args.amazon_fallback`` is set, from Amazon.es search. The number of
    rendered tiles never exceeds the number of ``x2`` tiles already on the
    page. ``$ImagenRelacionada`` is set to the first product's image.

    Args:
        path: PHP file to update; its stem is used as the search keywords.
        session: HTTP session for all requests.
        args: Parsed CLI options (``timeout``, ``retries``,
            ``limit_products``, ``amazon_fallback``, ``dry_run``).

    Returns:
        ``(handled, persisted)``: *handled* is True when new content was
        produced, *persisted* is True only when the file was written.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
        OSError: If the file cannot be read or written.
    """
    # Decode strictly. With errors="replace" every undecodable byte (e.g.
    # in an ISO-8859-1 page) would become U+FFFD and be written back,
    # permanently corrupting the page. The caller handles the decode error.
    text = path.read_text(encoding="utf-8")

    span = locate_main_section_span(text)
    if span is None:
        LOGGER.info("Skip %s (no main-results-section)", path.name)
        return False, False

    start_inner, end_inner = span
    slots = count_x2(text[start_inner:end_inner])
    if slots == 0:
        LOGGER.warning("%s: section has zero x2 tiles; leaving untouched", path.name)
        return False, False

    keyword_stem = path.stem
    html_blob = fetch_google_isch(
        session,
        keyword_stem,
        timeout=args.timeout,
        retries=args.retries,
    )
    hits: list[AmazonHit] = []
    if html_blob:
        hits = extract_amazon_hits(html_blob, limit=args.limit_products)

    if not hits and args.amazon_fallback:
        LOGGER.info(
            "%s: Google Images returned no ASINs (or fetch failed); using Amazon.es search",
            path.name,
        )
        hits = fetch_amazon_search_hits(
            session,
            keyword_stem,
            limit=args.limit_products,
            timeout=args.timeout,
            retries=args.retries,
        )

    if not hits:
        LOGGER.warning("%s: no product candidates after scraping", path.name)
        return False, False

    fill_count = min(slots, len(hits), args.limit_products)
    if fill_count < slots:
        LOGGER.warning(
            "%s: only %s products available (page had %s tiles)",
            path.name,
            fill_count,
            slots,
        )

    new_inner = render_section_inner(hits, fill_count)
    updated = replace_section_content(text, new_inner)

    # hits is non-empty here, so the first product always exists.
    first_thumb = hits[0].image_url or (
        f"https://m.media-amazon.com/images/P/{hits[0].asin}.01._SCLZZZZZZZ_.jpg"
    )
    updated = patch_imagen_relacionada(updated, first_thumb)

    if args.dry_run:
        LOGGER.info("[dry-run] Would update %s (%s cards)", path.name, fill_count)
        return True, False

    path.write_text(updated, encoding="utf-8", newline="\r\n")
    LOGGER.info("Updated %s (%s cards)", path.name, fill_count)
    return True, True





def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with all options of this script.

    Options are registered in a fixed order, which is also the order shown
    by ``--help``.
    """
    parser = argparse.ArgumentParser(
        description="Refresh amazon.es product grids using Google Images.",
    )

    option_table: list[tuple[str, dict]] = [
        (
            "--root",
            {
                "type": Path,
                "default": Path(__file__).resolve().parent,
                "help": "Website root folder that contains PHP pages (default: script directory).",
            },
        ),
        (
            "--only",
            {
                "type": str,
                "default": None,
                "help": "Restrict processing to a single PHP filename located in root.",
            },
        ),
        (
            "--progress-log",
            {
                "type": Path,
                "default": None,
                "help": (
                    f"Log of successfully written PHP filenames (default: <root>/{PROGRESS_LOG_FILE_NAME})."
                ),
            },
        ),
        (
            "--ignore-progress-log",
            {
                "action": "store_true",
                "help": "Process every file even if it appears in the progress log.",
            },
        ),
        (
            "--limit-pages",
            {
                "type": int,
                "default": 0,
                "help": "Stop after updating N qualifying pages (0 means no limit).",
            },
        ),
        (
            "--limit-products",
            {
                "type": int,
                "default": 60,
                "help": "Maximum candidates to consume from each Google SERP batch.",
            },
        ),
        (
            "--pause",
            {
                "type": float,
                "default": 0.0,
                "help": (
                    "Extra sleep after each PHP file (seconds); a random 200-500 ms pause "
                    "runs after every Google/Amazon HTTP response."
                ),
            },
        ),
        (
            "--timeout",
            {
                "type": float,
                "default": 25.0,
                "help": "HTTP timeout (seconds) for remote requests (Google or Amazon).",
            },
        ),
        (
            "--retries",
            {
                "type": int,
                "default": 2,
                "help": "Retries for each HTTP request (Google or Amazon), beyond the first try.",
            },
        ),
        (
            "--dry-run",
            {
                "action": "store_true",
                "help": "Parse and log actions without modifying PHP files.",
            },
        ),
        (
            "--skip-amazon-warmup",
            {
                "action": "store_true",
                "help": (
                    "Do not GET https://www.amazon.es/ once before SERP requests "
                    "(default primes cookies via homepage)."
                ),
            },
        ),
        (
            "--no-amazon-fallback",
            {
                "dest": "amazon_fallback",
                "action": "store_false",
                "help": "Disable Amazon.es search fallback; only use Google Images HTML.",
            },
        ),
    ]
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)

    parser.set_defaults(amazon_fallback=True, skip_amazon_warmup=False)
    return parser





def main() -> int:
    """Command-line entry point.

    Parses the options, loads the resume log, then processes each target
    PHP file in order. A page is appended to the progress log only after it
    was actually written to disk.

    Returns:
        ``0`` when the run completes, ``1`` when the file named by
        ``--only`` does not exist.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s %(message)s",
    )
    args = build_arg_parser().parse_args()

    progress_path = (
        args.progress_log
        if args.progress_log is not None
        else (args.root / PROGRESS_LOG_FILE_NAME)
    )
    resume_skip: set[str] = (
        set() if args.ignore_progress_log else load_progress_log(progress_path)
    )
    if resume_skip:
        LOGGER.info(
            "Resume: skipping %s file(s) listed in %s",
            len(resume_skip),
            progress_path,
        )

    try:
        targets = iter_target_files(args.root, args.only)
    except FileNotFoundError as exc:
        # A mistyped --only name is a usage error, not a crash.
        LOGGER.error("%s", exc)
        return 1

    processed = 0

    # The context manager closes pooled connections on every exit path.
    with requests.Session() as session:
        configure_http_session_headers(session)
        if not args.skip_amazon_warmup:
            warmup_amazon_homepage(session, args.timeout, args.retries)

        for page in targets:
            if args.limit_pages > 0 and processed >= args.limit_pages:
                LOGGER.info(
                    "limit-pages threshold reached (%s); stopping",
                    args.limit_pages,
                )
                break

            if not args.ignore_progress_log and page.name in resume_skip:
                LOGGER.info("Skip %s (already processed per log)", page.name)
                continue

            LOGGER.info("Processing %s …", page.name)
            handled = False
            persisted_disk = False
            try:
                handled, persisted_disk = process_file(page, session, args)
            except (UnicodeDecodeError, ValueError, OSError) as exc:
                LOGGER.error("Failed %s: %s", page.name, exc)

            if persisted_disk:
                try:
                    append_progress_entry(progress_path, page.name)
                except OSError as exc:
                    # The page itself was written; failing to record that
                    # must not abort the remaining pages.
                    LOGGER.error(
                        "Could not record %s in progress log %s: %s",
                        page.name,
                        progress_path,
                        exc,
                    )
                resume_skip.add(page.name)

            time.sleep(max(args.pause, 0.0))

            if handled and args.limit_pages > 0:
                processed += 1

    return 0





# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

