This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -743,7 +743,7 @@ def update(filter_patterns: Iterable[str] = (),
from archivebox.config.django import setup_django
setup_django()
from core.models import Snapshot
from archivebox.core.models import Snapshot
from django.utils import timezone
while True:
@@ -790,7 +790,7 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
Skip symlinks (already migrated).
Create DB records and trigger migration on save().
"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
from archivebox.config import CONSTANTS
from django.db import transaction
@@ -858,7 +858,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
Process all snapshots in DB.
Reconcile index.json and queue for archiving.
"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
from django.db import transaction
from django.utils import timezone
@@ -896,7 +896,7 @@ def process_filtered_snapshots(
batch_size: int
) -> dict:
"""Process snapshots matching filters (DB query only)."""
from core.models import Snapshot
from archivebox.core.models import Snapshot
from django.db import transaction
from django.utils import timezone
from datetime import datetime
@@ -1042,7 +1042,7 @@ def search(filter_patterns: list[str] | None=None,
with_headers: bool=False):
"""List, filter, and export information about archive entries"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')

View File

@@ -658,7 +658,7 @@ def create_model_record(record: dict) -> Any:
Returns:
Created/updated model instance
"""
from machine.models import Binary, Dependency
from archivebox.machine.models import Binary, Dependency
model_type = record.pop('type')
@@ -917,7 +917,7 @@ def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
if not cmd:
return None
from machine.models import Binary
from archivebox.machine.models import Binary
bin_path_or_name = cmd[0]
@@ -977,7 +977,7 @@ def run_hook(
"""
import time
from datetime import datetime, timezone
from machine.models import Machine
from archivebox.machine.models import Machine
start_time = time.time()
@@ -1125,7 +1125,7 @@ def run(self):
"""
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, find_binary_for_cmd, create_model_record
from machine.models import Machine
from archivebox.machine.models import Machine
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
@@ -1458,7 +1458,7 @@ def finalize_background_hook(archiveresult: 'ArchiveResult') -> None:
archiveresult: ArchiveResult instance to finalize
"""
from django.utils import timezone
from machine.models import Machine
from archivebox.machine.models import Machine
extractor_dir = Path(archiveresult.pwd)
stdout_file = extractor_dir / 'stdout.log'

View File

@@ -173,15 +173,15 @@ def process_hook_records(records: List[Dict], overrides: Dict = None) -> Dict[st
# Dispatch to appropriate model
if record_type == 'Snapshot':
from core.models import Snapshot
from archivebox.core.models import Snapshot
Snapshot.from_jsonl(record, overrides)
stats['Snapshot'] = stats.get('Snapshot', 0) + 1
elif record_type == 'Tag':
from core.models import Tag
from archivebox.core.models import Tag
Tag.from_jsonl(record, overrides)
stats['Tag'] = stats.get('Tag', 0) + 1
elif record_type == 'Binary':
from machine.models import Binary
from archivebox.machine.models import Binary
Binary.from_jsonl(record, overrides)
stats['Binary'] = stats.get('Binary', 0) + 1
# ... etc
@@ -526,7 +526,7 @@ class Model:
# Update children from filesystem
child.update_from_output()
def update_for_workers(self, **fields):
def update_and_requeue(self, **fields):
"""Update fields and bump modified_at."""
for field, value in fields.items():
setattr(self, field, value)
@@ -575,7 +575,7 @@ All core models (Crawl, Snapshot, ArchiveResult) now follow the unified pattern:
- State machines orchestrate transitions
- `.run()` methods execute hooks and process JSONL
- `.cleanup()` methods kill background hooks
- `.update_for_workers()` methods update state for worker coordination
- `.update_and_requeue()` methods update state for worker coordination
- Consistent use of `process_hook_records()` for JSONL dispatching
### ✅ Phases 7-8: Binary State Machine (Dependency Model Eliminated)