#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Validate URLs against crawl data from a WACZ file.
This script compares a list of target URLs (provided in a plain text file)
against URLs recorded in the `pages.jsonl` and `extraPages.jsonl` files
stored in a WACZ archive. It identifies:
- Matching URLs found in the WACZ file,
- Missing URLs not present in the WACZ file,
- URLs with a status code other than 200.
The WACZ archive must follow the standard structure where `pages.jsonl` and
`extraPages.jsonl` are located under the `pages/` directory.
Usage:
pages_json_log_validate.py url_list.txt archive.wacz
Arguments:
url_list: Path to the plain text file containing target URLs (one per line).
wacz_file: Path to the WACZ file to be analyzed.
Output:
- Matching URLs and their count.
- Missing URLs and their count.
- URLs with non-200 status codes, their count, and their corresponding status.
"""
import argparse
import json
import zipfile
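
# For reference, the script expects its two inputs in roughly these shapes.
# The values below are illustrative examples, not taken from a real crawl:
#
#   url_list.txt -- one URL per line:
#       https://example.com/
#       https://example.com/about
#
#   pages/pages.jsonl inside the WACZ -- one JSON object per line; only the
#   'url' and 'status' keys are read by this script, other keys are ignored:
#       {"url": "https://example.com/", "title": "Example", "status": 200}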


def read_urls_from_file(file_path):
    """Reads target URLs from a file and returns them as a set."""
    try:
        with open(file_path, 'r') as file:
            # Strip whitespace and skip blank lines so they are not reported as missing URLs
            return set(line.strip() for line in file if line.strip())
    except IOError as e:
        print(f"Error reading file {file_path}: {e}")
        return set()


def load_jsonl_from_wacz(wacz_path, jsonl_name):
    """
    Extracts URL and status information from a JSONL file within a WACZ archive.

    Args:
        wacz_path (str): Path to the WACZ file.
        jsonl_name (str): Name of the JSONL file (e.g., 'pages.jsonl') to extract.

    Returns:
        dict: A dictionary where keys are URLs and values are status codes.
    """
    url_data = {}
    try:
        with zipfile.ZipFile(wacz_path, 'r') as wacz:
            # Look for the JSONL file in the 'pages/' directory of the WACZ archive
            jsonl_path = f"pages/{jsonl_name}"
            if jsonl_path not in wacz.namelist():
                print(f"File {jsonl_name} not found in WACZ archive {wacz_path}.")
                return url_data
            with wacz.open(jsonl_path) as jsonl_file:
                for line_number, line in enumerate(jsonl_file, start=1):
                    try:
                        json_line = json.loads(line)
                        url = json_line.get('url')  # Extract URL
                        status = json_line.get('status')  # Extract status code
                        if url:
                            url_data[url] = status
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON on line {line_number} of {jsonl_name}: {e}")
    except (IOError, zipfile.BadZipFile) as e:
        # A corrupt or non-ZIP file raises BadZipFile rather than IOError
        print(f"Error reading WACZ file {wacz_path}: {e}")
    return url_data
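
# Illustrative return value of load_jsonl_from_wacz (hypothetical URLs):
#
#   {"https://example.com/": 200, "https://example.com/broken": 404}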


def compare_and_print_results(url_list, crawled_data):
    """
    Compares the target URLs against crawled data and prints results.

    Args:
        url_list (set): Set of target URLs from the input file.
        crawled_data (dict): Dictionary of crawled URLs and their statuses.

    Output:
        Prints the following information:
        - Count and list of matching URLs.
        - Count and list of missing URLs.
        - Count and list of URLs with non-200 statuses, including their status codes.
    """
    matching_urls = url_list.intersection(crawled_data.keys())
    missing_urls = url_list.difference(crawled_data.keys())
    non_200_status_urls = {
        url: status for url, status in crawled_data.items()
        if url in url_list and status != 200
    }

    # Print results with counts
    print(f"Matching URLs ({len(matching_urls)}):")
    if matching_urls:
        for url in matching_urls:
            print(url)
    else:
        print("No matching URLs found.")

    print(f"\nMissing URLs ({len(missing_urls)}):")
    if missing_urls:
        for url in missing_urls:
            print(url)
    else:
        print("No missing URLs.")

    print(f"\nURLs with non-200 statuses ({len(non_200_status_urls)}):")
    if non_200_status_urls:
        for url, status in non_200_status_urls.items():
            print(f"{url} - Status: {status}")
    else:
        print("All matching URLs have a status of 200.")


def main():
    """
    Main entry point for the script.

    - Parses command-line arguments for the URL list and WACZ file paths.
    - Reads the target URLs from the input file.
    - Loads crawled data from `pages.jsonl` and `extraPages.jsonl` in the WACZ file.
    - Compares the target URLs against crawled data and prints results.
    """
    parser = argparse.ArgumentParser(description='Validate URLs against crawl data in a WACZ file.')
    parser.add_argument('url_list', help='Path to the URL list file')
    parser.add_argument('wacz_file', help='Path to the WACZ file containing pages.jsonl and extraPages.jsonl')
    args = parser.parse_args()

    # Read target URLs
    url_list = read_urls_from_file(args.url_list)

    # Load crawled data from pages.jsonl and extraPages.jsonl
    pages_data = load_jsonl_from_wacz(args.wacz_file, 'pages.jsonl')
    extra_pages_data = load_jsonl_from_wacz(args.wacz_file, 'extraPages.jsonl')

    # Combine data from both files
    all_crawled_data = {**pages_data, **extra_pages_data}

    # Compare and print results
    compare_and_print_results(url_list, all_crawled_data)


if __name__ == '__main__':
    main()
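
# Example invocation (file names are illustrative):
#
#   python3 wacz_validator.py url_list.txt archive.wacz
#
# Note: when the same URL appears in both pages.jsonl and extraPages.jsonl,
# the status from extraPages.jsonl wins in the merged dictionary, because it
# is unpacked last in main().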