mirror of
https://github.com/zebrajr/ArchiveBox.git
synced 2026-01-15 12:15:10 +00:00
add overrides options to binproviders
This commit is contained in:
@@ -14,7 +14,12 @@
|
||||
"Bash(mkdir:*)",
|
||||
"Bash(chmod:*)",
|
||||
"Bash(python -m forum_dl:*)",
|
||||
"Bash(archivebox manage migrate:*)"
|
||||
"Bash(archivebox manage migrate:*)",
|
||||
"Bash(cat:*)",
|
||||
"Bash(python archivebox/plugins/pip/on_Dependency__install_using_pip_provider.py:*)",
|
||||
"Bash(forum-dl:*)",
|
||||
"Bash(pip uninstall:*)",
|
||||
"Bash(python:*)"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -186,11 +186,12 @@ class Migration(migrations.Migration):
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Hook Output Format
|
||||
## Phase 2: Hook Output Format Specification
|
||||
|
||||
### Hooks emit single JSON object to stdout
|
||||
|
||||
**Contract:**
|
||||
- Hook scripts must be executable (chmod +x) and specify their interpreter at the top with a /usr/bin/env shebang line
|
||||
- Hook emits ONE JSON object with `type: 'ArchiveResult'`
|
||||
- Hook can provide: `status`, `output_str`, `output_json`, `cmd` (optional)
|
||||
- Hook should NOT set: `output_files`, `output_size`, `output_mimetypes` (runner calculates these)
|
||||
@@ -203,37 +204,23 @@ class Migration(migrations.Migration):
|
||||
// Simple string output
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: 'Downloaded index.html (4.2 KB)'
|
||||
output_str: 'This is the page title',
|
||||
}));
|
||||
|
||||
// With structured metadata (headers, redirects, etc.)
|
||||
// With structured metadata and optional fields (headers, redirects, etc.)
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: 'Archived https://example.com',
|
||||
output_json: {
|
||||
headers: {'content-type': 'text/html', 'server': 'nginx'},
|
||||
redirects: [{from: 'http://example.com', to: 'https://example.com'}]
|
||||
}
|
||||
output_str: 'Got https://example.com headers',
|
||||
output_json: {'content-type': 'text/html', 'server': 'nginx', 'status-code': 200, 'content-length': 234235},
|
||||
}));
|
||||
|
||||
// With explicit cmd (for binary FK)
|
||||
// With explicit cmd (the first element of cmd should match InstalledBinary.bin_abspath or the XYZ_BINARY env var so ArchiveResult.run() can link the result to its InstalledBinary via FK)
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: 'Archived with wget',
|
||||
cmd: ['wget', '-p', '-k', 'https://example.com']
|
||||
}));
|
||||
|
||||
// Just structured data (no human-readable string)
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_json: {
|
||||
title: 'My Page Title',
|
||||
charset: 'UTF-8'
|
||||
}
|
||||
cmd: ['/some/abspath/to/wget', '-p', '-k', 'https://example.com']
|
||||
}));
|
||||
|
||||
// BAD: Don't duplicate ArchiveResult fields in output_json
|
||||
@@ -241,16 +228,17 @@ console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_json: {
|
||||
status: 'succeeded', // ❌ BAD - duplicates ArchiveResult.status
|
||||
output_files: ['index.html'], // ❌ BAD - runner calculates this
|
||||
custom_data: 'ok' // ✅ GOOD - custom fields only
|
||||
}
|
||||
status: 'succeeded', // ❌ BAD - this should be up a level on ArchiveResult.status, not inside output_json
|
||||
title: 'the page title', // ❌ BAD - if the extractor's main output is just a string then it belongs in output_str
|
||||
custom_data: 1234, // ✅ GOOD - custom fields only
|
||||
},
|
||||
output_files: {'index.html': {}}, // ❌ BAD - runner calculates this for us, no need to return it manually
|
||||
}));
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: run_hook() is Generic (No HookResult TypedDict)
|
||||
## Phase 3: Architecture - Generic run_hook()
|
||||
|
||||
`run_hook()` is a generic JSONL parser - it doesn't know about ArchiveResult, InstalledBinary, or any specific model. It just:
|
||||
1. Executes the hook script
|
||||
@@ -276,8 +264,8 @@ def run_hook(
|
||||
|
||||
Each Model.run() method handles its own record types differently:
|
||||
- ArchiveResult.run() extends ArchiveResult records with computed fields
|
||||
- Machine.run() creates InstalledBinary records from hook output
|
||||
- etc.
|
||||
- Dependency.run() creates InstalledBinary records from hook output
|
||||
- Crawl.run() can create Dependency records, Snapshots, or InstalledBinary records from hook output
|
||||
|
||||
Returns:
|
||||
List of dicts with 'type' field, each extended with metadata:
|
||||
@@ -285,9 +273,9 @@ def run_hook(
|
||||
{
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'succeeded',
|
||||
'output_str': '...',
|
||||
'plugin': 'wget',
|
||||
'plugin_hook': 'archivebox/plugins/wget/on_Snapshot__21_wget.py',
|
||||
'output_str': '...',
|
||||
# ... other hook-reported fields
|
||||
},
|
||||
{
|
||||
@@ -325,19 +313,241 @@ def create_model_record(record: dict) -> Any:
|
||||
model_type = record.pop('type')
|
||||
|
||||
if model_type == 'InstalledBinary':
|
||||
obj, created = InstalledBinary.objects.get_or_create(**record)
|
||||
obj, created = InstalledBinary.objects.get_or_create(**record) # if model requires custom logic implement InstalledBinary.from_jsonl(**record)
|
||||
return obj
|
||||
elif model_type == 'Dependency':
|
||||
obj, created = Dependency.objects.get_or_create(**record)
|
||||
return obj
|
||||
# Add more types as needed
|
||||
# ... Snapshot, ArchiveResult, etc. add more types as needed
|
||||
else:
|
||||
raise ValueError(f"Unknown record type: {model_type}")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: Update run_hook() Implementation
|
||||
## Phase 4: Plugin Audit & Standardization
|
||||
|
||||
**CRITICAL:** This phase MUST be done FIRST, before updating core code. Do this manually, one plugin at a time. Do NOT batch-update multiple plugins at once. Do NOT skip any plugins or checks.
|
||||
|
||||
**Why First?** Updating plugins to output clean JSONL before changing core code means the transition is safe and incremental. The current run_hook() can continue to work during the plugin updates.
|
||||
|
||||
### 4.1 Install Hook Standardization
|
||||
|
||||
All plugins should follow a consistent pattern for checking and declaring dependencies.
|
||||
|
||||
#### Hook Naming Convention
|
||||
|
||||
**RENAME ALL HOOKS:**
|
||||
- ❌ OLD: `on_Crawl__*_validate_*.{sh,py,js}`
|
||||
- ✅ NEW: `on_Crawl__*_install_*.{sh,py,js}`
|
||||
|
||||
Rationale: "install" is clearer than "validate" for what these hooks actually do.
|
||||
|
||||
#### Standard Install Hook Pattern
|
||||
|
||||
**ALL install hooks MUST follow this pattern:**
|
||||
|
||||
1. ✅ Check if InstalledBinary already exists for the configured binary
|
||||
2. ✅ If NOT found, emit a Dependency JSONL record, including `overrides` if you need to customize the install process
|
||||
3. ❌ NEVER directly call npm, apt, brew, pip, or any package manager
|
||||
4. ✅ Let bin provider plugins handle actual installation
|
||||
|
||||
**Example Standard Pattern:**
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Check for wget binary and emit Dependency if not found.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
def main():
|
||||
# 1. Get configured binary name/path from env
|
||||
binary_path = os.environ.get('WGET_BINARY', 'wget')
|
||||
|
||||
# 2. Check if InstalledBinary exists for this binary
|
||||
# (In practice, this check happens via database query in the actual implementation)
|
||||
# For install hooks, we emit a Dependency that the system will process
|
||||
|
||||
# 3. Emit Dependency JSONL if needed
|
||||
# The bin provider will check InstalledBinary and install if missing
|
||||
dependency = {
|
||||
'type': 'Dependency',
|
||||
'name': 'wget',
|
||||
'bin_name': Path(binary_path).name if '/' in binary_path else binary_path,
|
||||
'providers': ['apt', 'brew', 'pkg'], # Priority order
|
||||
'abspath': binary_path if binary_path.startswith('/') else None,
|
||||
}
|
||||
|
||||
print(json.dumps(dependency))
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
```
|
||||
|
||||
#### Config Variable Handling
|
||||
|
||||
**ALL hooks MUST respect user-configured binary paths:**
|
||||
|
||||
- ✅ Read `XYZ_BINARY` env var (e.g., `WGET_BINARY`, `YTDLP_BINARY`, `CHROME_BINARY`)
|
||||
- ✅ Support absolute paths: `WGET_BINARY=/usr/local/bin/wget2`
|
||||
- ✅ Support bin names: `WGET_BINARY=wget2`
|
||||
- ✅ Check for the CORRECT binary name in InstalledBinary
|
||||
- ✅ If user provides `WGET_BINARY=wget2`, check for `wget2` not `wget`
|
||||
|
||||
**Example Config Handling:**
|
||||
|
||||
```python
|
||||
# Get configured binary (could be path or name)
|
||||
binary_path = os.environ.get('WGET_BINARY', 'wget')
|
||||
|
||||
# Extract just the binary name for InstalledBinary lookup
|
||||
if '/' in binary_path:
|
||||
# Absolute path: /usr/local/bin/wget2 -> wget2
|
||||
bin_name = Path(binary_path).name
|
||||
else:
|
||||
# Just a name: wget2 -> wget2
|
||||
bin_name = binary_path
|
||||
|
||||
# Now check InstalledBinary for bin_name (not hardcoded 'wget')
|
||||
```
|
||||
|
||||
### 4.2 Snapshot Hook Standardization
|
||||
|
||||
All `on_Snapshot__*.*` hooks must follow the output format specified in **Phase 2**. Key points for implementation:
|
||||
|
||||
#### Output Format Requirements
|
||||
|
||||
**CRITICAL Legacy Issues to Fix:**
|
||||
|
||||
1. ❌ **Remove `RESULT_JSON=` prefix** - old hooks use `console.log('RESULT_JSON=' + ...)`
|
||||
2. ❌ **Remove extra output lines** - old hooks print VERSION=, START_TS=, END_TS=, STATUS=, OUTPUT=
|
||||
3. ❌ **Remove `--version` calls** - hooks should NOT run binary version checks
|
||||
4. ✅ **Output clean JSONL only** - exactly ONE line: `console.log(JSON.stringify(result))`
|
||||
|
||||
**Before (WRONG):**
|
||||
```javascript
|
||||
console.log(`VERSION=${version}`);
|
||||
console.log(`START_TS=${startTime.toISOString()}`);
|
||||
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
|
||||
```
|
||||
|
||||
**After (CORRECT):**
|
||||
```javascript
|
||||
console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_str: 'Done'}));
|
||||
```
|
||||
|
||||
> **See Phase 2 for complete JSONL format specification and examples.**
|
||||
|
||||
#### Using Configured Binaries
|
||||
|
||||
**ALL on_Snapshot hooks MUST:**
|
||||
|
||||
1. ✅ Read the correct `XYZ_BINARY` env var
|
||||
2. ✅ Use that binary path/name in their commands
|
||||
3. ✅ Pass cmd in JSONL output for binary FK lookup
|
||||
|
||||
**Example:**
|
||||
|
||||
```javascript
|
||||
// ✅ CORRECT - uses env var
|
||||
const wgetBinary = process.env.WGET_BINARY || 'wget';
|
||||
const cmd = [wgetBinary, '-p', '-k', url];
|
||||
|
||||
// Execute command...
|
||||
const result = execSync(cmd.join(' '));
|
||||
|
||||
// Report cmd in output for binary FK
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: 'Downloaded page',
|
||||
cmd: cmd, // ✅ Includes configured binary
|
||||
}));
|
||||
```
|
||||
|
||||
```javascript
|
||||
// ❌ WRONG - hardcoded binary name
|
||||
const cmd = ['wget', '-p', '-k', url]; // Ignores WGET_BINARY
|
||||
```
|
||||
|
||||
### 4.3 Per-Plugin Checklist
|
||||
|
||||
**For EACH plugin, verify ALL of these:**
|
||||
|
||||
#### Install Hook Checklist
|
||||
|
||||
- [ ] Renamed from `on_Crawl__*_validate_*` to `on_Crawl__*_install_*`
|
||||
- [ ] Reads `XYZ_BINARY` env var and handles both absolute paths + bin names
|
||||
- [ ] Emits `{"type": "Dependency", ...}` JSONL using the configured binary name (NOT a hardcoded name such as 'wget')
|
||||
- [ ] Does NOT call npm/apt/brew/pip directly
|
||||
- [ ] Follows standard pattern from section 4.1
|
||||
|
||||
#### Snapshot Hook Checklist
|
||||
|
||||
- [ ] Reads correct `XYZ_BINARY` env var and uses it in cmd
|
||||
- [ ] Outputs EXACTLY ONE JSONL line (NO `RESULT_JSON=` prefix)
|
||||
- [ ] NO extra output lines (VERSION=, START_TS=, END_TS=, STATUS=, OUTPUT=)
|
||||
- [ ] Does NOT run `--version` commands
|
||||
- [ ] Only provides allowed fields (type, status, output_str, output_json, cmd)
|
||||
- [ ] Does NOT include computed fields (see Phase 2 for forbidden fields list)
|
||||
- [ ] Includes `cmd` array with configured binary path
|
||||
|
||||
### 4.4 Implementation Process
|
||||
|
||||
**MANDATORY PROCESS:**
|
||||
|
||||
1. ✅ List ALL plugins in archivebox/plugins/
|
||||
2. ✅ For EACH plugin (DO NOT BATCH):
|
||||
a. Read ALL hook files in the plugin directory
|
||||
b. Check install hooks against checklist 4.3
|
||||
c. Check snapshot hooks against checklist 4.3
|
||||
d. Fix issues one by one
|
||||
e. Test the plugin hooks
|
||||
f. Move to next plugin
|
||||
3. ❌ DO NOT skip any plugins
|
||||
4. ❌ DO NOT batch-update multiple plugins
|
||||
5. ❌ DO NOT assume plugins are similar enough to update together
|
||||
|
||||
**Why one-by-one?**
|
||||
- Each plugin may have unique patterns
|
||||
- Each plugin may use different languages (sh/py/js)
|
||||
- Each plugin may have different edge cases
|
||||
- Batch updates lead to copy-paste errors
|
||||
|
||||
### 4.5 Testing Each Plugin
|
||||
|
||||
After updating each plugin, verify:
|
||||
|
||||
1. ✅ Install hook can be executed: `python3 on_Crawl__01_install_wget.py`
|
||||
2. ✅ Install hook outputs valid JSONL: `python3 ... | jq .`
|
||||
3. ✅ Install hook respects `XYZ_BINARY` env var
|
||||
4. ✅ Snapshot hook can be executed with test URL
|
||||
5. ✅ Snapshot hook outputs EXACTLY ONE JSONL line
|
||||
6. ✅ Snapshot hook JSONL parses correctly: `... | jq .type`
|
||||
7. ✅ Snapshot hook uses configured binary from env
|
||||
|
||||
### 4.6 Common Pitfalls
|
||||
|
||||
When auditing plugins, watch for these common mistakes:
|
||||
|
||||
1. **Hardcoded binary names** - Check for lookups like `InstalledBinary.objects.filter(name='wget')` → these should use the configured binary name
|
||||
2. **Old output format** - Look for `RESULT_JSON=`, `VERSION=`, `START_TS=` lines
|
||||
3. **Computed fields in output** - Watch for `output_files`, `start_ts`, `duration` in JSONL
|
||||
4. **Missing config variables** - Ensure hooks read `XYZ_BINARY` env vars
|
||||
5. **Version checks** - Remove any `--version` command executions
|
||||
|
||||
> See sections 4.1 and 4.2 for detailed before/after examples.
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: Update run_hook() Implementation
|
||||
|
||||
**Note:** Only do this AFTER Phase 4 (plugin standardization) is complete. By then, all plugins will output clean JSONL and this implementation will work smoothly.
|
||||
|
||||
### Location: `archivebox/hooks.py`
|
||||
|
||||
@@ -546,7 +756,9 @@ def run_hook(
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: Update ArchiveResult.run()
|
||||
## Phase 6: Update ArchiveResult.run()
|
||||
|
||||
**Note:** Only do this AFTER Phase 5 (run_hook() implementation) is complete.
|
||||
|
||||
### Location: `archivebox/core/models.py`
|
||||
|
||||
@@ -562,7 +774,7 @@ def run(self):
|
||||
computed fields (output_files, output_size, binary FK, etc.).
|
||||
"""
|
||||
from django.utils import timezone
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, find_binary_for_cmd, create_model_record
|
||||
from machine.models import Machine
|
||||
|
||||
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
|
||||
@@ -802,9 +1014,47 @@ All existing queries continue to work unchanged - the dict structure is backward
|
||||
|
||||
---
|
||||
|
||||
## Phase 6: Background Hook Finalization
|
||||
## Phase 7: Background Hook Support
|
||||
|
||||
### Helper Functions
|
||||
This phase adds support for long-running background hooks that don't block other extractors.
|
||||
|
||||
### 7.1 Background Hook Detection
|
||||
|
||||
Background hooks are identified by a `.bg.` suffix in the filename (immediately before the file extension):
|
||||
- `on_Snapshot__21_consolelog.bg.js` ← background
|
||||
- `on_Snapshot__11_favicon.js` ← foreground
|
||||
|
||||
### 7.2 Rename Background Hooks
|
||||
|
||||
**Files to rename:**
|
||||
|
||||
```bash
|
||||
# Use .bg. suffix (not __background)
|
||||
mv archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js \
|
||||
archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js
|
||||
|
||||
mv archivebox/plugins/ssl/on_Snapshot__23_ssl.js \
|
||||
archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
|
||||
|
||||
mv archivebox/plugins/responses/on_Snapshot__24_responses.js \
|
||||
archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
|
||||
```
|
||||
|
||||
**Update hook content to emit proper JSON:**
|
||||
|
||||
Each hook should emit:
|
||||
```javascript
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded', // or 'failed' or 'skipped'
|
||||
output_str: 'Captured 15 console messages', // human-readable summary
|
||||
output_json: { // optional structured metadata
|
||||
// ... specific to each hook
|
||||
}
|
||||
}));
|
||||
```
|
||||
|
||||
### 7.3 Finalization Helper Functions
|
||||
|
||||
Location: `archivebox/core/models.py` or new `archivebox/core/background_hooks.py`
|
||||
|
||||
@@ -934,7 +1184,7 @@ def finalize_background_hook(archiveresult: 'ArchiveResult') -> None:
|
||||
stderr_file.unlink()
|
||||
```
|
||||
|
||||
### Update SnapshotMachine
|
||||
### 7.4 Update SnapshotMachine
|
||||
|
||||
Location: `archivebox/core/statemachines.py`
|
||||
|
||||
@@ -967,82 +1217,12 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||
return True
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 6b: Deduplication
|
||||
### 7.5 Deduplication
|
||||
|
||||
Deduplication is handled by external filesystem tools like `fdupes` (hardlinks), ZFS dedup, Btrfs duperemove, or rdfind. Users can run these tools periodically on the archive directory to identify and link duplicate files. ArchiveBox doesn't need to track hashes or manage deduplication itself - the filesystem layer handles it transparently.
|
||||
|
||||
---
|
||||
|
||||
## Phase 7: Rename Background Hooks
|
||||
|
||||
### Files to rename:
|
||||
|
||||
```bash
|
||||
# Use .bg. suffix (not __background)
|
||||
mv archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js \
|
||||
archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js
|
||||
|
||||
mv archivebox/plugins/ssl/on_Snapshot__23_ssl.js \
|
||||
archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
|
||||
|
||||
mv archivebox/plugins/responses/on_Snapshot__24_responses.js \
|
||||
archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
|
||||
```
|
||||
|
||||
### Update hook content to emit proper JSON:
|
||||
|
||||
Each hook should emit:
|
||||
```javascript
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded', // or 'failed' or 'skipped'
|
||||
output_str: 'Captured 15 console messages', // human-readable summary
|
||||
output_json: { // optional structured metadata
|
||||
// ... specific to each hook
|
||||
}
|
||||
}));
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 8: Update Existing Hooks
|
||||
|
||||
### Update all hooks to emit proper JSON format
|
||||
|
||||
**Example: favicon hook**
|
||||
|
||||
```python
|
||||
# Before
|
||||
print(f'Favicon saved ({size} bytes)')
|
||||
print(f'OUTPUT={OUTPUT_FILE}')
|
||||
print(f'STATUS=succeeded')
|
||||
|
||||
# After
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'succeeded',
|
||||
'output_str': f'Favicon saved ({size} bytes)',
|
||||
'output_json': {
|
||||
'size': size,
|
||||
'format': 'ico'
|
||||
}
|
||||
}
|
||||
print(json.dumps(result))
|
||||
```
|
||||
|
||||
**Example: wget hook with explicit cmd**
|
||||
|
||||
```bash
|
||||
# After wget completes
|
||||
cat <<EOF
|
||||
{"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded index.html", "cmd": ["wget", "-p", "-k", "$URL"]}
|
||||
EOF
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
### 1. Unit Tests
|
||||
@@ -1166,13 +1346,18 @@ cd archivebox
|
||||
python manage.py makemigrations core --name archiveresult_background_hooks
|
||||
```
|
||||
|
||||
### Step 2: Update run_hook()
|
||||
### Step 2: **Plugin standardization (Phase 4)**
|
||||
- Update ALL plugins to new JSONL format FIRST
|
||||
- Test each plugin as you update it
|
||||
- This ensures the old run_hook() keeps working during the transition
|
||||
|
||||
### Step 3: Update run_hook() (Phase 5)
|
||||
- Add background hook detection
|
||||
- Add log file capture
|
||||
- Parse JSONL output (any line with {type: 'ModelName', ...})
|
||||
- Add plugin and plugin_hook metadata to each record
|
||||
|
||||
### Step 3: Update ArchiveResult.run()
|
||||
### Step 4: Update ArchiveResult.run() (Phase 6)
|
||||
- Handle None result for background hooks (return immediately)
|
||||
- Parse records list from run_hook()
|
||||
- Assert only one ArchiveResult record per hook
|
||||
@@ -1180,22 +1365,18 @@ python manage.py makemigrations core --name archiveresult_background_hooks
|
||||
- Call `_populate_output_fields()` to walk directory and populate summary fields
|
||||
- Call `create_model_record()` for any side-effect records (InstalledBinary, etc.)
|
||||
|
||||
### Step 4: Add finalization helpers
|
||||
### Step 5: Add finalization helpers (Phase 7)
|
||||
- `find_background_hooks()`
|
||||
- `check_background_hook_completed()`
|
||||
- `finalize_background_hook()`
|
||||
|
||||
### Step 5: Update SnapshotMachine.is_finished()
|
||||
### Step 6: Update SnapshotMachine.is_finished() (Phase 7)
|
||||
- Check for background hooks
|
||||
- Finalize completed ones
|
||||
|
||||
### Step 6: Rename hooks
|
||||
### Step 7: Rename background hooks (Phase 7)
|
||||
- Rename 3 background hooks with .bg. suffix
|
||||
|
||||
### Step 7: Update hook outputs
|
||||
- Update all hooks to emit JSON format
|
||||
- Remove manual timestamp/status calculation
|
||||
|
||||
### Step 8: Test
|
||||
- Unit tests
|
||||
- Integration tests
|
||||
@@ -1214,6 +1395,8 @@ python manage.py makemigrations core --name archiveresult_background_hooks
|
||||
- ✅ Log files cleaned up on success, kept on failure
|
||||
- ✅ PID files cleaned up after completion
|
||||
- ✅ No plugin-specific code in core (generic polling mechanism)
|
||||
- ✅ All plugins updated to clean JSONL format
|
||||
- ✅ Safe incremental rollout (plugins first, then core code)
|
||||
|
||||
---
|
||||
|
||||
@@ -25,7 +25,8 @@ AptProvider.model_rebuild()
|
||||
@click.option('--bin-name', required=True, help="Binary name to install")
|
||||
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
|
||||
@click.option('--custom-cmd', default=None, help="Custom install command (overrides default)")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
|
||||
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
|
||||
"""Install binary using apt package manager."""
|
||||
|
||||
# Check if apt provider is allowed
|
||||
@@ -42,7 +43,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
|
||||
click.echo(f"Installing {bin_name} via apt...", err=True)
|
||||
|
||||
try:
|
||||
binary = Binary(name=bin_name, binproviders=[provider]).install()
|
||||
# Parse overrides if provided
|
||||
overrides_dict = None
|
||||
if overrides:
|
||||
try:
|
||||
overrides_dict = json.loads(overrides)
|
||||
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
|
||||
except json.JSONDecodeError:
|
||||
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
|
||||
|
||||
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
|
||||
except Exception as e:
|
||||
click.echo(f"apt install failed: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -25,7 +25,8 @@ BrewProvider.model_rebuild()
|
||||
@click.option('--bin-name', required=True, help="Binary name to install")
|
||||
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
|
||||
@click.option('--custom-cmd', default=None, help="Custom install command")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
|
||||
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
|
||||
"""Install binary using Homebrew."""
|
||||
|
||||
if bin_providers != '*' and 'brew' not in bin_providers.split(','):
|
||||
@@ -41,7 +42,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
|
||||
click.echo(f"Installing {bin_name} via brew...", err=True)
|
||||
|
||||
try:
|
||||
binary = Binary(name=bin_name, binproviders=[provider]).install()
|
||||
# Parse overrides if provided
|
||||
overrides_dict = None
|
||||
if overrides:
|
||||
try:
|
||||
overrides_dict = json.loads(overrides)
|
||||
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
|
||||
except json.JSONDecodeError:
|
||||
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
|
||||
|
||||
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
|
||||
except Exception as e:
|
||||
click.echo(f"brew install failed: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -15,42 +15,12 @@ import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, version_flag],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_forumdl() -> dict | None:
|
||||
"""Find forum-dl binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||
|
||||
class ForumdlBinary(Binary):
|
||||
name: str = 'forum-dl'
|
||||
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||
|
||||
binary = ForumdlBinary()
|
||||
binary = Binary(name='forum-dl', binproviders=[PipProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -86,7 +56,7 @@ def main():
|
||||
missing_deps = []
|
||||
|
||||
# Emit results for forum-dl
|
||||
if forumdl_result and forumdl_result.get('abspath'):
|
||||
if forumdl_result and forumdl_result.get('abspath') and forumdl_result.get('version'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': forumdl_result['name'],
|
||||
@@ -111,10 +81,19 @@ def main():
|
||||
'value': forumdl_result['version'],
|
||||
}))
|
||||
else:
|
||||
# forum-dl has cchardet dependency that doesn't compile on Python 3.14+
|
||||
# Provide overrides to install with chardet instead
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'forum-dl',
|
||||
'bin_providers': 'pip,env',
|
||||
'overrides': {
|
||||
'pip': {
|
||||
'packages': ['--no-deps', 'forum-dl', 'chardet', 'pydantic', 'beautifulsoup4', 'lxml',
|
||||
'requests', 'urllib3', 'tenacity', 'python-dateutil',
|
||||
'html2text', 'warcio']
|
||||
}
|
||||
}
|
||||
}))
|
||||
missing_deps.append('forum-dl')
|
||||
|
||||
|
||||
@@ -137,6 +137,8 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
return True, None, '' # Not a forum site - success, no output
|
||||
if 'no content' in stderr_lower:
|
||||
return True, None, '' # No forum found - success, no output
|
||||
if 'extractornotfounderror' in stderr_lower:
|
||||
return True, None, '' # No forum extractor for this URL - success, no output
|
||||
if result.returncode == 0:
|
||||
return True, None, '' # forum-dl exited cleanly, just no forum - success
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
@@ -24,6 +25,75 @@ FORUMDL_HOOK = PLUGIN_DIR / 'on_Snapshot__53_forumdl.py'
|
||||
FORUMDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_forumdl.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Module-level cache for installed binary path
|
||||
_forumdl_binary_path = None
|
||||
|
||||
def get_forumdl_binary_path():
|
||||
"""Get the installed forum-dl binary path from cache or by running validation/installation."""
|
||||
global _forumdl_binary_path
|
||||
if _forumdl_binary_path:
|
||||
return _forumdl_binary_path
|
||||
|
||||
# Run validation hook to find or install binary
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(FORUMDL_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
# Check if binary was found
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary' and record.get('name') == 'forum-dl':
|
||||
_forumdl_binary_path = record.get('abspath')
|
||||
return _forumdl_binary_path
|
||||
elif record.get('type') == 'Dependency' and record.get('bin_name') == 'forum-dl':
|
||||
# Need to install via pip hook
|
||||
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Dependency__install_using_pip_provider.py'
|
||||
dependency_id = str(uuid.uuid4())
|
||||
|
||||
# Build command with overrides if present
|
||||
cmd = [
|
||||
sys.executable, str(pip_hook),
|
||||
'--dependency-id', dependency_id,
|
||||
'--bin-name', record['bin_name']
|
||||
]
|
||||
if 'overrides' in record:
|
||||
cmd.extend(['--overrides', json.dumps(record['overrides'])])
|
||||
|
||||
install_result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
# Parse InstalledBinary from pip installation
|
||||
for install_line in install_result.stdout.strip().split('\n'):
|
||||
if install_line.strip():
|
||||
try:
|
||||
install_record = json.loads(install_line)
|
||||
if install_record.get('type') == 'InstalledBinary' and install_record.get('name') == 'forum-dl':
|
||||
_forumdl_binary_path = install_record.get('abspath')
|
||||
return _forumdl_binary_path
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Installation failed - print debug info
|
||||
if not _forumdl_binary_path:
|
||||
print(f"\n=== forum-dl installation failed ===", file=sys.stderr)
|
||||
print(f"stdout: {install_result.stdout}", file=sys.stderr)
|
||||
print(f"stderr: {install_result.stderr}", file=sys.stderr)
|
||||
print(f"returncode: {install_result.returncode}", file=sys.stderr)
|
||||
return None
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}"
|
||||
@@ -64,38 +134,40 @@ def test_forumdl_validate_hook():
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify forum-dl is available via abx-pkg."""
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
missing_binaries = []
|
||||
|
||||
# Verify forum-dl is available
|
||||
forumdl_binary = Binary(name='forum-dl', binproviders=[PipProvider(), EnvProvider()])
|
||||
forumdl_loaded = forumdl_binary.load()
|
||||
if not (forumdl_loaded and forumdl_loaded.abspath):
|
||||
missing_binaries.append('forum-dl')
|
||||
|
||||
if missing_binaries:
|
||||
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||
"""Verify forum-dl is installed by calling the REAL validation and installation hooks."""
|
||||
binary_path = get_forumdl_binary_path()
|
||||
assert binary_path, (
|
||||
"forum-dl must be installed successfully via validation hook and pip provider. "
|
||||
"NOTE: forum-dl has a dependency on cchardet which does not compile on Python 3.14+ "
|
||||
"due to removed longintrepr.h header. This is a known compatibility issue with forum-dl."
|
||||
)
|
||||
assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
|
||||
|
||||
|
||||
def test_handles_non_forum_url():
|
||||
"""Test that forum-dl extractor handles non-forum URLs gracefully via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
import os
|
||||
|
||||
binary_path = get_forumdl_binary_path()
|
||||
assert binary_path, "Binary must be installed for this test"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
env = os.environ.copy()
|
||||
env['FORUMDL_BINARY'] = binary_path
|
||||
|
||||
# Run forum-dl extraction hook on non-forum URL
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
# Should exit 0 even for non-forum URL
|
||||
# Should exit 0 even for non-forum URL (graceful handling)
|
||||
assert result.returncode == 0, f"Should handle non-forum URL gracefully: {result.stderr}"
|
||||
|
||||
# Verify JSONL output
|
||||
@@ -138,8 +210,12 @@ def test_config_timeout():
|
||||
"""Test that FORUMDL_TIMEOUT config is respected."""
|
||||
import os
|
||||
|
||||
binary_path = get_forumdl_binary_path()
|
||||
assert binary_path, "Binary must be installed for this test"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['FORUMDL_BINARY'] = binary_path
|
||||
env['FORUMDL_TIMEOUT'] = '5'
|
||||
|
||||
result = subprocess.run(
|
||||
|
||||
@@ -15,42 +15,12 @@ import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, version_flag],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_ytdlp() -> dict | None:
|
||||
"""Find yt-dlp binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||
from abx_pkg import Binary, PipProvider, BrewProvider, AptProvider, EnvProvider
|
||||
|
||||
class YtdlpBinary(Binary):
|
||||
name: str = 'yt-dlp'
|
||||
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||
|
||||
binary = YtdlpBinary()
|
||||
binary = Binary(name='yt-dlp', binproviders=[PipProvider(), BrewProvider(), AptProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -71,8 +41,8 @@ def find_ytdlp() -> dict | None:
|
||||
return {
|
||||
'name': 'yt-dlp',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'version': None,
|
||||
'sha256': None,
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
@@ -84,12 +54,7 @@ def find_node() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
class NodeBinary(Binary):
|
||||
name: str = 'node'
|
||||
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||
overrides: dict = {'apt': {'packages': ['nodejs']}}
|
||||
|
||||
binary = NodeBinary()
|
||||
binary = Binary(name='node', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -110,8 +75,8 @@ def find_node() -> dict | None:
|
||||
return {
|
||||
'name': 'node',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'version': None,
|
||||
'sha256': None,
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
@@ -123,11 +88,7 @@ def find_ffmpeg() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
class FfmpegBinary(Binary):
|
||||
name: str = 'ffmpeg'
|
||||
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||
|
||||
binary = FfmpegBinary()
|
||||
binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -148,8 +109,8 @@ def find_ffmpeg() -> dict | None:
|
||||
return {
|
||||
'name': 'ffmpeg',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'version': None,
|
||||
'sha256': None,
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
@@ -197,7 +158,7 @@ def main():
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'yt-dlp',
|
||||
'bin_providers': 'pip,env',
|
||||
'bin_providers': 'pip,brew,apt,env',
|
||||
}))
|
||||
missing_deps.append('yt-dlp')
|
||||
|
||||
@@ -227,10 +188,14 @@ def main():
|
||||
'value': node_result['version'],
|
||||
}))
|
||||
else:
|
||||
# node is installed as 'nodejs' package on apt
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'node',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
'overrides': {
|
||||
'apt': {'packages': ['nodejs']}
|
||||
}
|
||||
}))
|
||||
missing_deps.append('node')
|
||||
|
||||
|
||||
@@ -15,43 +15,12 @@ import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_mercury() -> dict | None:
|
||||
"""Find postlight-parser binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||
|
||||
class MercuryBinary(Binary):
|
||||
name: str = 'postlight-parser'
|
||||
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||
overrides: dict = {'npm': {'packages': ['@postlight/parser']}}
|
||||
|
||||
binary = MercuryBinary()
|
||||
binary = Binary(name='postlight-parser', binproviders=[NpmProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -72,8 +41,8 @@ def find_mercury() -> dict | None:
|
||||
return {
|
||||
'name': 'postlight-parser',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'version': None,
|
||||
'sha256': None,
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
@@ -110,10 +79,14 @@ def main():
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
# postlight-parser is installed as @postlight/parser in npm
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'postlight-parser',
|
||||
'bin_providers': 'npm,env',
|
||||
'overrides': {
|
||||
'npm': {'packages': ['@postlight/parser']}
|
||||
}
|
||||
}))
|
||||
print(f"postlight-parser binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -25,7 +25,8 @@ NpmProvider.model_rebuild()
|
||||
@click.option('--bin-name', required=True, help="Binary name to install")
|
||||
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
|
||||
@click.option('--custom-cmd', default=None, help="Custom install command")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
|
||||
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
|
||||
"""Install binary using npm."""
|
||||
|
||||
if bin_providers != '*' and 'npm' not in bin_providers.split(','):
|
||||
@@ -41,7 +42,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
|
||||
click.echo(f"Installing {bin_name} via npm...", err=True)
|
||||
|
||||
try:
|
||||
binary = Binary(name=bin_name, binproviders=[provider]).install()
|
||||
# Parse overrides if provided
|
||||
overrides_dict = None
|
||||
if overrides:
|
||||
try:
|
||||
overrides_dict = json.loads(overrides)
|
||||
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
|
||||
except json.JSONDecodeError:
|
||||
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
|
||||
|
||||
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
|
||||
except Exception as e:
|
||||
click.echo(f"npm install failed: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -15,6 +15,7 @@ import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
@@ -24,6 +25,67 @@ PAPERSDL_HOOK = PLUGIN_DIR / 'on_Snapshot__54_papersdl.py'
|
||||
PAPERSDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_papersdl.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Module-level cache for installed binary path
|
||||
_papersdl_binary_path = None
|
||||
|
||||
def _iter_papersdl_jsonl(stdout):
    """Yield every JSON object found on its own line of *stdout*, skipping blank and non-JSON lines."""
    for raw_line in stdout.strip().split('\n'):
        if not raw_line.strip():
            continue
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            # Hooks may interleave plain log lines with JSONL records.
            continue
        # A bare JSON scalar/array is not a record and has no .get().
        if isinstance(record, dict):
            yield record


def _install_papersdl_via_pip(record):
    """Run the pip provider hook for a Dependency *record*; return the installed abspath or None."""
    pip_hook = PLUGINS_ROOT / 'pip' / 'on_Dependency__install_using_pip_provider.py'
    cmd = [
        sys.executable, str(pip_hook),
        '--dependency-id', str(uuid.uuid4()),
        '--bin-name', record['bin_name'],
    ]
    # Forward provider overrides emitted by the validation hook, if any.
    if 'overrides' in record:
        cmd.extend(['--overrides', json.dumps(record['overrides'])])

    install_result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=300,
    )

    for install_record in _iter_papersdl_jsonl(install_result.stdout):
        if install_record.get('type') == 'InstalledBinary' and install_record.get('name') == 'papers-dl':
            return install_record.get('abspath')

    # No InstalledBinary record was emitted: surface the hook output so a
    # failing test shows why the install did not succeed.
    print("\n=== papers-dl installation failed ===", file=sys.stderr)
    print(f"stdout: {install_result.stdout}", file=sys.stderr)
    print(f"stderr: {install_result.stderr}", file=sys.stderr)
    print(f"returncode: {install_result.returncode}", file=sys.stderr)
    return None


def get_papersdl_binary_path():
    """Get the installed papers-dl binary path from cache or by running validation/installation.

    Runs the validation hook and scans its JSONL stdout in order. An
    ``InstalledBinary`` record for papers-dl supplies the path directly; a
    ``Dependency`` record for papers-dl triggers the pip provider hook. A
    path that is found is cached in the module-level
    ``_papersdl_binary_path`` for later calls.

    Returns:
        The binary's abspath as reported by the hooks, or None if neither
        hook reported an installed papers-dl binary.
    """
    global _papersdl_binary_path
    if _papersdl_binary_path:
        return _papersdl_binary_path

    # Run validation hook to find or install binary
    result = subprocess.run(
        [sys.executable, str(PAPERSDL_VALIDATE_HOOK)],
        capture_output=True,
        text=True,
        timeout=300,
    )

    for record in _iter_papersdl_jsonl(result.stdout):
        record_type = record.get('type')
        if record_type == 'InstalledBinary' and record.get('name') == 'papers-dl':
            _papersdl_binary_path = record.get('abspath')
            return _papersdl_binary_path
        if record_type == 'Dependency' and record.get('bin_name') == 'papers-dl':
            installed_path = _install_papersdl_via_pip(record)
            if installed_path:
                _papersdl_binary_path = installed_path
                return _papersdl_binary_path

    return None
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert PAPERSDL_HOOK.exists(), f"Hook not found: {PAPERSDL_HOOK}"
|
||||
@@ -64,34 +126,32 @@ def test_papersdl_validate_hook():
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify papers-dl is available via abx-pkg."""
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
missing_binaries = []
|
||||
|
||||
# Verify papers-dl is available
|
||||
papersdl_binary = Binary(name='papers-dl', binproviders=[PipProvider(), EnvProvider()])
|
||||
papersdl_loaded = papersdl_binary.load()
|
||||
if not (papersdl_loaded and papersdl_loaded.abspath):
|
||||
missing_binaries.append('papers-dl')
|
||||
|
||||
if missing_binaries:
|
||||
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||
"""Verify papers-dl is installed by calling the REAL validation and installation hooks."""
|
||||
binary_path = get_papersdl_binary_path()
|
||||
assert binary_path, "papers-dl must be installed successfully via validation hook and pip provider"
|
||||
assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
|
||||
|
||||
|
||||
def test_handles_non_paper_url():
|
||||
"""Test that papers-dl extractor handles non-paper URLs gracefully via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
import os
|
||||
|
||||
binary_path = get_papersdl_binary_path()
|
||||
assert binary_path, "Binary must be installed for this test"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
env = os.environ.copy()
|
||||
env['PAPERSDL_BINARY'] = binary_path
|
||||
|
||||
# Run papers-dl extraction hook on non-paper URL
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
@@ -138,8 +198,12 @@ def test_config_timeout():
|
||||
"""Test that PAPERSDL_TIMEOUT config is respected."""
|
||||
import os
|
||||
|
||||
binary_path = get_papersdl_binary_path()
|
||||
assert binary_path, "Binary must be installed for this test"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['PAPERSDL_BINARY'] = binary_path
|
||||
env['PAPERSDL_TIMEOUT'] = '5'
|
||||
|
||||
result = subprocess.run(
|
||||
|
||||
@@ -25,7 +25,8 @@ PipProvider.model_rebuild()
|
||||
@click.option('--bin-name', required=True, help="Binary name to install")
|
||||
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
|
||||
@click.option('--custom-cmd', default=None, help="Custom install command")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
|
||||
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
|
||||
"""Install binary using pip."""
|
||||
|
||||
if bin_providers != '*' and 'pip' not in bin_providers.split(','):
|
||||
@@ -41,7 +42,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
|
||||
click.echo(f"Installing {bin_name} via pip...", err=True)
|
||||
|
||||
try:
|
||||
binary = Binary(name=bin_name, binproviders=[provider]).install()
|
||||
# Parse overrides if provided
|
||||
overrides_dict = None
|
||||
if overrides:
|
||||
try:
|
||||
overrides_dict = json.loads(overrides)
|
||||
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
|
||||
except json.JSONDecodeError:
|
||||
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
|
||||
|
||||
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
|
||||
except Exception as e:
|
||||
click.echo(f"pip install failed: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -15,43 +15,12 @@ import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_readability() -> dict | None:
|
||||
"""Find readability-extractor binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||
|
||||
class ReadabilityBinary(Binary):
|
||||
name: str = 'readability-extractor'
|
||||
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||
overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||
|
||||
binary = ReadabilityBinary()
|
||||
binary = Binary(name='readability-extractor', binproviders=[NpmProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -72,8 +41,8 @@ def find_readability() -> dict | None:
|
||||
return {
|
||||
'name': 'readability-extractor',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'version': None,
|
||||
'sha256': None,
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
@@ -110,10 +79,14 @@ def main():
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
# readability-extractor is installed from GitHub
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'readability-extractor',
|
||||
'bin_providers': 'npm,env',
|
||||
'overrides': {
|
||||
'npm': {'packages': ['github:ArchiveBox/readability-extractor']}
|
||||
}
|
||||
}))
|
||||
print(f"readability-extractor binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
Reference in New Issue
Block a user