Many of the existing wrappers for interacting with archive.today (and archive.is, archive.ph, etc.) from the command line seem to be either outdated or broken. So, here's a Python script that submits links to archive.today from the command line and retrieves their archived URLs.
From testing, it seems best to keep the delay around 8 to 10 seconds. If you go too fast, Cloudflare will start yelling at you and throwing 429 errors.
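If you'd rather not hand-tune the delay, one option is to back off automatically whenever a 429 comes back. Here's a rough sketch; the get_with_backoff name, retry count, and base delay are my own choices, not part of the script below:

import time


def get_with_backoff(session, url, max_retries=5, base_delay=10, **kwargs):
    # Retry a GET with a growing pause whenever the server answers 429.
    resp = None
    for attempt in range(max_retries):
        resp = session.get(url, **kwargs)
        if resp.status_code != 429:
            return resp
        wait = base_delay * (attempt + 1)
        print(f"Got 429, backing off for {wait} seconds...")
        time.sleep(wait)
    return resp  # still 429 after max_retries attempts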
As long as you've received a "WIP" URL from archive.today, it should be archived shortly after, though it may not appear immediately.
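If you need the snapshot to actually be ready before moving on, you could poll the WIP URL until it serves a page. Another rough sketch, with a made-up wait_for_snapshot name and polling interval:

import time


def wait_for_snapshot(session, wip_url, interval=15, max_checks=20):
    # Poll a WIP URL until it returns a page, then strip /wip/ like the script does.
    for _ in range(max_checks):
        resp = session.get(wip_url, allow_redirects=True)
        if resp.status_code == 200:
            return resp.url.replace("/wip/", "/")
        time.sleep(interval)
    return None  # still processing after max_checks polls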
Add your own random user-agent. :)
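Something along these lines works; the strings here are just placeholders, so swap in real browser user-agents:

import random

import requests

# Placeholder strings -- replace with real browser user-agent values.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ...",
]

session = requests.Session()
session.headers.update({"User-Agent": random.choice(USER_AGENTS)})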
'''
% python3 archiveToday.py --help
usage: archiveToday.py [-h] --urls URLS [--delay DELAY] [--output OUTPUT]

Batch archive URLs with archive.today

options:
  -h, --help       show this help message and exit
  --urls URLS      Path to file containing URLs (one per line)
  --delay DELAY    Delay between submissions in seconds
  --output OUTPUT  CSV file to save results
'''
import argparse
import csv
import os
import re
import time

import requests

def archive_url(session, url):
    """Submit a single URL to archive.ph and return (original_url, archived_url)."""
    try:
        print(f"Archiving: {url}")
        resp = session.get(
            "https://archive.ph/submit/",
            params={"url": url},
            allow_redirects=False,
        )

        # If already archived, archive.ph responds with a 302 redirect to the snapshot
        if resp.status_code == 302:
            archived_url = resp.headers.get("Location", "")
            # Match both 4 and 5 character archive slugs
            match = re.match(r"(https://archive\.ph/\w{4,5})", archived_url)
            if match:
                archived_url = match.group(1)
            print(f"Already archived: {archived_url}")
            return url, archived_url

        # If it needs archiving, a 200 response carries a refresh header pointing to /wip/
        if resp.status_code == 200:
            refresh_header = resp.headers.get("refresh", "")
            match = re.search(r"url=(https?://[^\s]+)", refresh_header)
            if not match:
                print("WIP URL not found in refresh header.")
                return url, None
            wip_url = match.group(1)
            print(f"Archiving in progress (WIP): {wip_url}")
            final_resp = session.get(wip_url, allow_redirects=True)
            if final_resp.status_code == 200:
                archived_url = final_resp.url.replace("/wip/", "/")
                print(f"Archived: {archived_url}")
                return url, archived_url
            else:
                print(f"Failed to retrieve from WIP URL. Status: {final_resp.status_code}")
                return url, None

        print(f"Unexpected status code: {resp.status_code}")
        return url, None
    except Exception as e:
        print(f"Exception archiving {url}: {e}")
        return url, None

def read_urls_from_file(filename):
    """Read URLs from a text file, skipping blank lines and # comment lines."""
    if not os.path.exists(filename):
        print(f"File not found: {filename}")
        return []
    with open(filename, "r") as f:
        urls = [
            line.strip()
            for line in f
            if line.strip() and not line.strip().startswith("#")
        ]
    return urls

def main():
    parser = argparse.ArgumentParser(description="Batch archive URLs with archive.today")
    parser.add_argument("--urls", required=True, help="Path to file containing URLs (one per line)")
    parser.add_argument("--delay", type=int, default=30, help="Delay between submissions in seconds")
    parser.add_argument("--output", default="archived_results.csv", help="CSV file to save results")
    args = parser.parse_args()

    urls = read_urls_from_file(args.urls)
    if not urls:
        print("No URLs to archive.")
        return

    session = requests.Session()
    session.headers.update({
        "User-Agent": ""  # add your own (random) user-agent string here
    })

    with open(args.output, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Original URL", "Archived URL"])
        for idx, url in enumerate(urls, 1):
            print(f"\n[{idx}/{len(urls)}]")
            original, archived = archive_url(session, url)
            writer.writerow([original, archived or ""])
            if idx < len(urls):
                print(f"Waiting {args.delay} seconds before next...")
                time.sleep(args.delay)

    print(f"\nFinished. Results saved to {args.output}")


if __name__ == "__main__":
    main()
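A sample run, assuming your URLs live in a file called urls.txt:

% python3 archiveToday.py --urls urls.txt --delay 10 --output results.csv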