fix orchestrator state machine and ArchiveResult → Process data migrations

Nick Sweeting
2026-01-01 16:43:02 -08:00
parent 876feac522
commit 60422adc87
13 changed files with 378 additions and 96 deletions

View File

@@ -26,7 +26,8 @@
"Bash(grep:*)",
"WebFetch(domain:python-statemachine.readthedocs.io)",
"Bash(./bin/run_plugin_tests.sh:*)",
"Bash(done)"
"Bash(done)",
"Bash(coverage erase:*)"
]
}
}

.gitignore
View File

@@ -10,6 +10,7 @@ tests/out/
.coverage
.coverage.*
coverage.json
coverage/
htmlcov/
# Python and Node dependencies

View File

@@ -173,6 +173,8 @@ class ConstantsDict(Mapping):
CUSTOM_TEMPLATES_DIR_NAME,
CUSTOM_PLUGINS_DIR_NAME,
CRONTABS_DIR_NAME,
"invalid",
"users",
# Backwards compatibility with old directory names
"user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')

View File

@@ -57,18 +57,8 @@ class Migration(migrations.Migration):
name='snapshot',
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
),
migrations.RemoveField(
model_name='archiveresult',
name='cmd',
),
migrations.RemoveField(
model_name='archiveresult',
name='cmd_version',
),
migrations.RemoveField(
model_name='archiveresult',
name='pwd',
),
# NOTE: RemoveField for cmd, cmd_version, pwd moved to migration 0027
# to allow data migration to Process records first
migrations.AddField(
model_name='archiveresult',
name='config',

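For context on the note above: the RemoveField operations can only run once a data migration has copied cmd/cmd_version/pwd into Process (and Binary) rows. As a rough illustration only, with model names, app labels, and field names inferred from the tests further down rather than taken from the real migration 0027, such a backfill might look like:

from django.db import migrations

# Hypothetical backfill sketch, not the actual ArchiveBox migration 0027.
# Mirrors the status/exit_code mapping asserted by verify_process_migration().
STATUS_MAP = {
    'queued':    ('queued', None),
    'started':   ('running', None),
    'backoff':   ('queued', None),
    'succeeded': ('exited', 0),
    'failed':    ('exited', 1),
    'skipped':   ('exited', None),
}

def backfill_processes(apps, schema_editor):
    ArchiveResult = apps.get_model('core', 'ArchiveResult')
    Process = apps.get_model('machine', 'Process')   # app label assumed from the machine_process table
    for ar in ArchiveResult.objects.filter(process__isnull=True).iterator():
        status, exit_code = STATUS_MAP.get(ar.status, ('queued', None))
        proc = Process.objects.create(cmd=ar.cmd or [], status=status, exit_code=exit_code)
        ar.process = proc
        ar.save(update_fields=['process'])
        # Binary creation from ar.cmd_version is omitted here for brevity

class Migration(migrations.Migration):
    dependencies = [('core', '0026_previous')]        # placeholder dependency
    operations = [migrations.RunPython(backfill_processes, migrations.RunPython.noop)]

Only after a backfill of this shape has run can 0027 safely drop the legacy columns.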
View File

@@ -2208,7 +2208,7 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to.itself(unless='is_finished', on='on_started_to_started') |
started.to(sealed, cond='is_finished')
)
@@ -2243,6 +2243,13 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
status=Snapshot.StatusChoices.STARTED,
)
def on_started_to_started(self):
"""Called when Snapshot stays in started state (archiveresults not finished yet)."""
# Bump retry_at so we check again in a few seconds
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=5),
)
@sealed.enter
def enter_sealed(self):
# Clean up background hooks

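The new on= callback is what keeps a not-yet-finished snapshot cycling: every tick that stays in started also pushes retry_at a few seconds into the future so the orchestrator re-checks it later. A toy, self-contained python-statemachine sketch of that pattern (not the ArchiveBox models):

from datetime import datetime, timedelta
from statemachine import StateMachine, State

class RetrySketch(StateMachine):
    # hypothetical stand-in for SnapshotMachine, for illustration only
    queued = State(initial=True)
    started = State()
    sealed = State(final=True)

    tick = (
        queued.to(started, cond='can_start')
        | started.to.itself(unless='is_finished', on='bump_retry_at')
        | started.to(sealed, cond='is_finished')
    )

    def __init__(self):
        self.retry_at = datetime.now()
        self.finished = False
        super().__init__()

    def can_start(self):
        return True

    def is_finished(self):
        return self.finished

    def bump_retry_at(self):
        # analogue of on_started_to_started(): check again in a few seconds
        self.retry_at = datetime.now() + timedelta(seconds=5)

Calling tick() repeatedly leaves the machine in started (bumping retry_at each time) until is_finished flips and it seals, which is the same shape CrawlMachine adopts in the next file.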
View File

@@ -502,7 +502,7 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to.itself(unless='is_finished', on='on_started_to_started') |
started.to(sealed, cond='is_finished')
)

View File

@@ -1201,6 +1201,14 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
# Dispatch to appropriate model's from_json() method
if record_type == 'Snapshot':
from archivebox.core.models import Snapshot
# Check if discovered snapshot exceeds crawl max_depth
snapshot_depth = record.get('depth', 0)
crawl = overrides.get('crawl')
if crawl and snapshot_depth > crawl.max_depth:
# Skip - this URL was discovered but exceeds max crawl depth
continue
obj = Snapshot.from_json(record.copy(), overrides)
if obj:
stats['Snapshot'] = stats.get('Snapshot', 0) + 1

View File

@@ -163,8 +163,10 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
@click.option('--snapshot-id', required=False, help='Parent Snapshot UUID')
@click.option('--crawl-id', required=False, help='Crawl UUID')
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse Netscape bookmark HTML and extract URLs."""
try:
@@ -188,7 +190,12 @@ def main(url: str, snapshot_id: str = None):
'type': 'Snapshot',
'url': unescape(bookmark_url),
'plugin': PLUGIN_NAME,
'depth': depth + 1,
}
if snapshot_id:
entry['parent_snapshot_id'] = snapshot_id
if crawl_id:
entry['crawl_id'] = crawl_id
if title:
entry['title'] = unescape(title)
if tags_str:

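With depth, parent snapshot, and crawl id now threaded through the CLI options, a discovered bookmark carries its lineage back to the hook runner. Purely for illustration (the plugin name and ids below are placeholders, not real values), a hook invoked with --depth 1 would print a JSONL record shaped like:

{"type": "Snapshot", "url": "https://example.com/page", "plugin": "<PLUGIN_NAME>", "depth": 2, "parent_snapshot_id": "<parent uuid>", "crawl_id": "<crawl uuid>", "title": "Example"}

process_hook_records() (earlier in this commit) then drops any such record whose depth exceeds the crawl's max_depth.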
View File

@@ -100,8 +100,10 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
@click.option('--snapshot-id', required=False, help='Parent Snapshot UUID')
@click.option('--crawl-id', required=False, help='Crawl UUID')
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse plain text and extract URLs."""
try:
@@ -123,9 +125,12 @@ def main(url: str, snapshot_id: str = None):
'type': 'Snapshot',
'url': found_url,
'plugin': PLUGIN_NAME,
'depth': depth + 1,
}
if snapshot_id:
record['parent_snapshot_id'] = snapshot_id
if crawl_id:
record['crawl_id'] = crawl_id
print(json.dumps(record))
# Emit ArchiveResult record to mark completion

View File

@@ -30,6 +30,7 @@ from .test_migrations_helpers import (
verify_foreign_keys,
verify_all_snapshots_in_output,
verify_crawl_count,
verify_process_migration,
)
@@ -260,6 +261,54 @@ class TestMigrationFrom08x(unittest.TestCase):
self.assertTrue('ArchiveBox' in output or 'version' in output.lower(),
f"Version output missing expected content: {output[:500]}")
def test_migration_creates_process_records(self):
"""Migration should create Process records for all ArchiveResults."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Verify Process records created
expected_count = len(self.original_data['archiveresults'])
ok, msg = verify_process_migration(self.db_path, expected_count)
self.assertTrue(ok, msg)
def test_migration_creates_binary_records(self):
"""Migration should create Binary records from cmd_version data."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
# Check Binary records exist
cursor.execute("SELECT COUNT(*) FROM machine_binary")
binary_count = cursor.fetchone()[0]
# Should have at least one binary per unique extractor
extractors = set(ar['extractor'] for ar in self.original_data['archiveresults'])
self.assertGreaterEqual(binary_count, len(extractors),
f"Expected at least {len(extractors)} Binaries, got {binary_count}")
conn.close()
def test_migration_preserves_cmd_data(self):
"""Migration should preserve cmd data in Process.cmd field."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
# Check that Process records have cmd arrays
cursor.execute("SELECT cmd FROM machine_process WHERE cmd != '[]'")
cmd_records = cursor.fetchall()
# All Processes should have non-empty cmd (test data has json.dumps([extractor, '--version']))
expected_count = len(self.original_data['archiveresults'])
self.assertEqual(len(cmd_records), expected_count,
f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}")
conn.close()
class TestMigrationDataIntegrity08x(unittest.TestCase):
"""Comprehensive data integrity tests for 0.8.x migrations."""

View File

@@ -730,44 +730,26 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
tag_id = cursor.lastrowid
created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
# Create Seeds first (required for 0.8.x Crawls)
test_seeds = [
('https://example.com', 'auto', 'Example Seed'),
('https://github.com/ArchiveBox', 'auto', 'GitHub Seed'),
]
created_data['seeds'] = []
for uri, extractor, label in test_seeds:
seed_id = generate_uuid()
cursor.execute("""
INSERT INTO crawls_seed (id, created_at, created_by_id, modified_at, uri,
extractor, tags_str, label, config, output_dir, notes,
num_uses_failed, num_uses_succeeded)
VALUES (?, datetime('now'), ?, datetime('now'), ?, ?, '', ?, '{}', '', '', 0, 0)
""", (seed_id, user_id, uri, extractor, label))
created_data['seeds'].append({'id': seed_id, 'uri': uri, 'label': label})
# Create 2 Crawls (linked to Seeds)
# Create 2 Crawls (0.9.0 schema - no seeds)
test_crawls = [
('https://example.com\nhttps://example.org', 0, 'Example Crawl', created_data['seeds'][0]['id']),
('https://github.com/ArchiveBox', 1, 'GitHub Crawl', created_data['seeds'][1]['id']),
('https://example.com\nhttps://example.org', 0, 'Example Crawl'),
('https://github.com/ArchiveBox', 1, 'GitHub Crawl'),
]
for i, (urls, max_depth, label, seed_id) in enumerate(test_crawls):
for i, (urls, max_depth, label) in enumerate(test_crawls):
crawl_id = generate_uuid()
cursor.execute("""
INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, seed_id, urls,
INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls,
config, max_depth, tags_str, label, status, retry_at,
num_uses_failed, num_uses_succeeded)
VALUES (?, datetime('now'), ?, datetime('now'), ?, ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
""", (crawl_id, user_id, seed_id, urls, max_depth, label))
VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
""", (crawl_id, user_id, urls, max_depth, label))
created_data['crawls'].append({
'id': crawl_id,
'urls': urls,
'max_depth': max_depth,
'label': label,
'seed_id': seed_id,
})
# Create 5 snapshots linked to crawls
@@ -1146,3 +1128,64 @@ def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
if count == expected:
return True, f"Crawl count OK: {count}"
return False, f"Crawl count mismatch: expected {expected}, got {count}"
def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> Tuple[bool, str]:
"""
Verify that ArchiveResults were properly migrated to Process records.
Checks:
1. All ArchiveResults have process_id set
2. Process count matches ArchiveResult count
3. Binary records created for unique cmd_version values
4. Status mapping is correct
"""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
# Check all ArchiveResults have process_id
cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL")
null_count = cursor.fetchone()[0]
if null_count > 0:
conn.close()
return False, f"Found {null_count} ArchiveResults without process_id"
# Check Process count
cursor.execute("SELECT COUNT(*) FROM machine_process")
process_count = cursor.fetchone()[0]
if process_count != expected_archiveresult_count:
conn.close()
return False, f"Expected {expected_archiveresult_count} Processes, got {process_count}"
# Check status mapping
cursor.execute("""
SELECT ar.status, p.status, p.exit_code
FROM core_archiveresult ar
JOIN machine_process p ON ar.process_id = p.id
""")
status_errors = []
for ar_status, p_status, p_exit_code in cursor.fetchall():
expected_p_status, expected_exit_code = {
'queued': ('queued', None),
'started': ('running', None),
'backoff': ('queued', None),
'succeeded': ('exited', 0),
'failed': ('exited', 1),
'skipped': ('exited', None),
}.get(ar_status, ('queued', None))
if p_status != expected_p_status:
status_errors.append(f"AR status {ar_status} → Process {p_status}, expected {expected_p_status}")
if p_exit_code != expected_exit_code:
status_errors.append(f"AR status {ar_status} → exit_code {p_exit_code}, expected {expected_exit_code}")
if status_errors:
conn.close()
return False, f"Status mapping errors: {'; '.join(status_errors[:5])}"
conn.close()
return True, f"Process migration verified: {process_count} Processes created"

View File

@@ -175,8 +175,50 @@ class Orchestrator:
"""Spawn a new worker process. Returns PID or None if spawn failed."""
try:
pid = WorkerClass.start(daemon=False)
# Worker spawning is logged by the worker itself in on_startup()
return pid
# CRITICAL: Block until worker registers itself in Process table
# This prevents race condition where orchestrator spawns multiple workers
# before any of them finish on_startup() and register
from archivebox.machine.models import Process
import time
timeout = 5.0 # seconds to wait for worker registration
poll_interval = 0.1 # check every 100ms
elapsed = 0.0
spawn_time = timezone.now()
while elapsed < timeout:
# Check if worker process is registered with strict criteria:
# 1. Correct PID
# 2. WORKER process type
# 3. RUNNING status
# 4. Parent is this orchestrator
# 5. Started recently (within last 10 seconds)
worker_process = Process.objects.filter(
pid=pid,
process_type=Process.TypeChoices.WORKER,
status=Process.StatusChoices.RUNNING,
parent_id=self.db_process.id,
started_at__gte=spawn_time - timedelta(seconds=10),
).first()
if worker_process:
# Worker successfully registered!
return pid
time.sleep(poll_interval)
elapsed += poll_interval
# Timeout - worker failed to register
log_worker_event(
worker_type='Orchestrator',
event='Worker failed to register in time',
indent_level=0,
pid=self.pid,
metadata={'worker_type': WorkerClass.name, 'worker_pid': pid, 'timeout': timeout},
)
return None
except Exception as e:
log_worker_event(
worker_type='Orchestrator',
@@ -266,48 +308,75 @@ class Orchestrator:
def runloop(self) -> None:
"""Main orchestrator loop."""
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn
from archivebox.misc.logging import IS_TTY
import archivebox.misc.logging as logging_module
from archivebox.misc.logging import IS_TTY, CONSOLE
import sys
import os
# Enable progress bars only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
# Save original consoles
original_console = logging_module.CONSOLE
original_stderr = logging_module.STDERR
# Debug
print(f"[yellow]DEBUG: IS_TTY={IS_TTY}, exit_on_idle={self.exit_on_idle}, show_progress={show_progress}[/yellow]")
# Create Progress with the console it will control
progress = Progress(
TextColumn("[cyan]{task.description}"),
BarColumn(bar_width=40),
TaskProgressColumn(),
transient=False,
console=original_console, # Use the original console
) if show_progress else None
self.on_startup()
task_ids = {}
task_ids = {} # snapshot_id -> task_id
if not show_progress:
# No progress bars - just run normally
self._run_orchestrator_loop(None, task_ids, None, None)
else:
# Redirect worker subprocess output to /dev/null
devnull_fd = os.open(os.devnull, os.O_WRONLY)
# Wrapper to convert console.print() to console.log() for Rich Progress
class ConsoleLogWrapper:
def __init__(self, console):
self._console = console
def print(self, *args, **kwargs):
# Use log() instead of print() to work with Live display
self._console.log(*args)
def __getattr__(self, name):
return getattr(self._console, name)
# Save original stdout/stderr (make 2 copies - one for Console, one for restoring)
original_stdout = sys.stdout.fileno()
original_stderr = sys.stderr.fileno()
stdout_for_console = os.dup(original_stdout)
stdout_for_restore = os.dup(original_stdout)
stderr_for_restore = os.dup(original_stderr)
try:
# Redirect stdout/stderr to /dev/null (workers will inherit this)
os.dup2(devnull_fd, original_stdout)
os.dup2(devnull_fd, original_stderr)
# Create Console using saved stdout (not the redirected one)
from rich.console import Console
import archivebox.misc.logging as logging_module
orchestrator_console = Console(file=os.fdopen(stdout_for_console, 'w'), force_terminal=True)
# Update global CONSOLE so orchestrator logs appear too
original_console = logging_module.CONSOLE
logging_module.CONSOLE = orchestrator_console
# Now create Progress and run loop (DON'T restore stdout/stderr - workers need /dev/null)
with Progress(
TextColumn("[cyan]{task.description}"),
BarColumn(bar_width=40),
TaskProgressColumn(),
console=orchestrator_console,
) as progress:
self._run_orchestrator_loop(progress, task_ids, None, None)
# Restore original console
logging_module.CONSOLE = original_console
finally:
# Restore stdout/stderr
os.dup2(stdout_for_restore, original_stdout)
os.dup2(stderr_for_restore, original_stderr)
# Cleanup
try:
os.close(devnull_fd)
os.close(stdout_for_restore)
os.close(stderr_for_restore)
except:
pass
# stdout_for_console is closed by orchestrator_console
def _run_orchestrator_loop(self, progress, task_ids, read_fd, console):
"""Run the main orchestrator loop with optional progress display."""
try:
if progress:
progress.start()
# Wrap progress.console so print() calls become log() calls
wrapped_console = ConsoleLogWrapper(progress.console)
logging_module.CONSOLE = wrapped_console
logging_module.STDERR = wrapped_console
# Call on_startup AFTER redirecting consoles
self.on_startup()
while True:
# Check queues and spawn workers
queue_sizes = self.check_queues_and_spawn_workers()
@@ -333,12 +402,33 @@ class Orchestrator:
status__in=['succeeded', 'skipped', 'failed']
).count()
# Find currently running hook (ordered by hook_name to get lowest step number)
current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first()
if not current_ar:
# If nothing running, show next queued item (ordered to get next in sequence)
current_ar = snapshot.archiveresult_set.filter(status='queued').order_by('hook_name').first()
current_plugin = ''
if current_ar:
# Use hook_name if available, otherwise plugin name
hook_name = current_ar.hook_name or current_ar.plugin or ''
# Extract just the hook name without path (e.g., "on_Snapshot__50_wget.py" -> "wget")
if hook_name:
# Clean up the name: remove prefix and extension
clean_name = hook_name.split('__')[-1] if '__' in hook_name else hook_name
clean_name = clean_name.replace('.py', '').replace('.sh', '').replace('.bg', '')
current_plugin = f"{clean_name}"
# Build description with URL + current plugin
url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
description = f"{url}{current_plugin}"
# Create or update task
if snapshot.id not in task_ids:
url = snapshot.url[:60] + '...' if len(snapshot.url) > 60 else snapshot.url
task_ids[snapshot.id] = progress.add_task(url, total=total, completed=completed)
task_ids[snapshot.id] = progress.add_task(description, total=total, completed=completed)
else:
progress.update(task_ids[snapshot.id], completed=completed)
# Update both progress and description
progress.update(task_ids[snapshot.id], description=description, completed=completed)
# Remove tasks for snapshots that are no longer active
for snapshot_id in list(task_ids.keys()):
@@ -373,12 +463,6 @@ class Orchestrator:
raise
else:
self.on_shutdown()
finally:
if progress:
# Restore original consoles
logging_module.CONSOLE = original_console
logging_module.STDERR = original_stderr
progress.stop()
def start(self) -> int:
"""

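The stdout/stderr juggling above is the subtle part of this change: the process-wide fds 1 and 2 are pointed at /dev/null so spawned workers inherit silence, while a duplicated copy of the original stdout keeps the Rich progress display attached to the real terminal. A standalone sketch of that pattern (illustrative, not the orchestrator code):

import os
import sys
from rich.console import Console

devnull_fd = os.open(os.devnull, os.O_WRONLY)
stdout_fd, stderr_fd = sys.stdout.fileno(), sys.stderr.fileno()

# two copies of the real stdout: one feeds the Console, one restores fd 1 later
stdout_for_console = os.dup(stdout_fd)
stdout_for_restore = os.dup(stdout_fd)
stderr_for_restore = os.dup(stderr_fd)

try:
    os.dup2(devnull_fd, stdout_fd)    # fd 1 → /dev/null, inherited by children
    os.dup2(devnull_fd, stderr_fd)    # fd 2 → /dev/null
    console = Console(file=os.fdopen(stdout_for_console, 'w'), force_terminal=True)
    console.log('still visible on the real terminal')
    # ... spawn worker subprocesses here; their output goes to /dev/null ...
finally:
    os.dup2(stdout_for_restore, stdout_fd)   # put the terminal back on fds 1 and 2
    os.dup2(stderr_for_restore, stderr_fd)
    for fd in (devnull_fd, stdout_for_restore, stderr_for_restore):
        os.close(fd)
    # stdout_for_console is owned (and eventually closed) by the Console's file object

The two saved copies matter: if the Console and the restore path shared a single duplicated fd, closing the Console's file would also close the descriptor needed to restore fd 1.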
View File

@@ -1,14 +1,20 @@
#!/bin/bash
# Run ArchiveBox plugin tests
# Run ArchiveBox plugin tests with coverage
#
# All plugin tests use pytest and are located in pluginname/tests/test_*.py
#
# Usage: ./bin/run_plugin_tests.sh [plugin_name]
# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage]
#
# Examples:
# ./bin/run_plugin_tests.sh # Run all plugin tests
# ./bin/run_plugin_tests.sh chrome # Run chrome plugin tests
# ./bin/run_plugin_tests.sh parse_* # Run all parse_* plugin tests
# ./bin/test_plugins.sh # Run all plugin tests with coverage
# ./bin/test_plugins.sh chrome # Run chrome plugin tests with coverage
# ./bin/test_plugins.sh parse_* # Run all parse_* plugin tests with coverage
# ./bin/test_plugins.sh --no-coverage # Run all tests without coverage
#
# Coverage results are saved to .coverage and can be viewed with:
# coverage combine
# coverage report
# coverage json
set -e
@@ -18,11 +24,43 @@ RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Save root directory first
ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
# Parse arguments
PLUGIN_FILTER="${1:-}"
PLUGIN_FILTER=""
ENABLE_COVERAGE=true
for arg in "$@"; do
if [ "$arg" = "--no-coverage" ]; then
ENABLE_COVERAGE=false
else
PLUGIN_FILTER="$arg"
fi
done
# Reset coverage data if collecting coverage
if [ "$ENABLE_COVERAGE" = true ]; then
echo "Resetting coverage data..."
cd "$ROOT_DIR" || exit 1
coverage erase
rm -rf "$ROOT_DIR/coverage/js" 2>/dev/null
mkdir -p "$ROOT_DIR/coverage/js"
# Enable Python subprocess coverage
export COVERAGE_PROCESS_START="$ROOT_DIR/pyproject.toml"
export PYTHONPATH="$ROOT_DIR:$PYTHONPATH" # For sitecustomize.py
# Enable Node.js V8 coverage (built-in, no packages needed)
export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js"
echo "Python coverage: enabled (subprocess support)"
echo "JavaScript coverage: enabled (NODE_V8_COVERAGE)"
echo ""
fi
# Change to plugins directory
cd "$(dirname "$0")/../archivebox/plugins" || exit 1
cd "$ROOT_DIR/archivebox/plugins" || exit 1
echo "=========================================="
echo "ArchiveBox Plugin Tests"
@@ -34,6 +72,12 @@ if [ -n "$PLUGIN_FILTER" ]; then
else
echo "Running all plugin tests"
fi
if [ "$ENABLE_COVERAGE" = true ]; then
echo "Coverage: enabled"
else
echo "Coverage: disabled"
fi
echo ""
# Track results
@@ -67,7 +111,13 @@ for test_dir in $TEST_DIRS; do
echo -e "${YELLOW}[RUNNING]${NC} $plugin_name"
if python -m pytest "$test_dir" -p no:django -v --tb=short 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
# Build pytest command with optional coverage
PYTEST_CMD="python -m pytest $test_dir -p no:django -v --tb=short"
if [ "$ENABLE_COVERAGE" = true ]; then
PYTEST_CMD="$PYTEST_CMD --cov=$plugin_name --cov-append --cov-branch"
fi
if eval "$PYTEST_CMD" 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
echo -e "${GREEN}[PASSED]${NC} $plugin_name"
PASSED_PLUGINS=$((PASSED_PLUGINS + 1))
else
@@ -91,6 +141,41 @@ if [ $TOTAL_PLUGINS -eq 0 ]; then
exit 0
elif [ $FAILED_PLUGINS -eq 0 ]; then
echo -e "${GREEN}✓ All plugin tests passed!${NC}"
# Show coverage summary if enabled
if [ "$ENABLE_COVERAGE" = true ]; then
echo ""
echo "=========================================="
echo "Python Coverage Summary"
echo "=========================================="
# Coverage data is in ROOT_DIR, combine and report from there
cd "$ROOT_DIR" || exit 1
# Copy coverage data from plugins dir if it exists
if [ -f "$ROOT_DIR/archivebox/plugins/.coverage" ]; then
cp "$ROOT_DIR/archivebox/plugins/.coverage" "$ROOT_DIR/.coverage"
fi
coverage combine 2>/dev/null || true
coverage report --include="archivebox/plugins/*" --omit="*/tests/*" 2>&1 | head -50
echo ""
echo "=========================================="
echo "JavaScript Coverage Summary"
echo "=========================================="
if [ -d "$ROOT_DIR/coverage/js" ] && [ "$(ls -A "$ROOT_DIR/coverage/js" 2>/dev/null)" ]; then
node "$ROOT_DIR/bin/convert_v8_coverage.js" "$ROOT_DIR/coverage/js"
else
echo "No JavaScript coverage data collected"
echo "(JS hooks may not have been executed during tests)"
fi
echo ""
echo "For detailed coverage reports (from project root):"
echo " Python: coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'"
echo " Python: coverage json # LLM-friendly format"
echo " Python: coverage html # Interactive HTML report"
echo " JavaScript: node bin/convert_v8_coverage.js coverage/js"
fi
exit 0
else
echo -e "${RED}✗ Some plugin tests failed${NC}"
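One note on the coverage setup earlier in this script: exporting COVERAGE_PROCESS_START and putting the project root on PYTHONPATH only measures subprocesses if a sitecustomize.py on that path calls coverage's process-startup hook. A minimal sketch of such a file, assuming coverage.py's documented subprocess mechanism (the repo's actual sitecustomize.py may differ):

# sitecustomize.py: imported automatically at interpreter startup when its
# directory is on PYTHONPATH. coverage.process_startup() is a no-op unless
# the COVERAGE_PROCESS_START environment variable points at a config file.
try:
    import coverage
    coverage.process_startup()
except ImportError:
    pass  # coverage not installed; run uninstrumented

Each measured subprocess writes its own .coverage.* data file, which is why the script runs coverage combine before coverage report (and why .coverage.* is ignored in the .gitignore change above).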