Many of the existing wrappers for interacting with archive.today (and archive.is, archive.ph, etc.) from the command line seem to be either outdated or broken. So, here's a Python script that submits links to archive.today from the command line and retrieves their archived URLs.
From testing, it seems best to keep the delay around 8 to 10 seconds. If you go too fast, Cloudflare will start yelling at you and throwing 429 errors.
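If you'd rather not hand-tune the delay, one option is to back off automatically whenever a 429 comes back. Here's a rough sketch; the get_with_backoff name, retry count, and base delay are my own choices, not part of the script below:

import time


def get_with_backoff(session, url, max_retries=5, base_delay=10, **kwargs):
    # Retry a GET with a growing pause whenever the server answers 429.
    resp = None
    for attempt in range(max_retries):
        resp = session.get(url, **kwargs)
        if resp.status_code != 429:
            return resp
        wait = base_delay * (attempt + 1)
        print(f"Got 429, backing off for {wait} seconds...")
        time.sleep(wait)
    return resp  # still 429 after max_retries attempts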
As long as you've received a "WIP" URL from archive.today, it should be archived shortly after, though it may not appear immediately.
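If you need the snapshot to actually be ready before moving on, you could poll the WIP URL until it serves a page. Another rough sketch, with a made-up wait_for_snapshot name and polling interval:

import time


def wait_for_snapshot(session, wip_url, interval=15, max_checks=20):
    # Poll a WIP URL until it returns a page, then strip /wip/ like the script does.
    for _ in range(max_checks):
        resp = session.get(wip_url, allow_redirects=True)
        if resp.status_code == 200:
            return resp.url.replace("/wip/", "/")
        time.sleep(interval)
    return None  # still processing after max_checks polls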
Add your own random user-agent. :)
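Something along these lines works; the strings here are just placeholders, so swap in real browser user-agents:

import random

import requests

# Placeholder strings -- replace with real browser user-agent values.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ...",
]

session = requests.Session()
session.headers.update({"User-Agent": random.choice(USER_AGENTS)})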
'''
% python3 archiveToday.py --help
usage: archiveToday.py [-h] --urls URLS [--delay DELAY] [--output OUTPUT]

Batch archive URLs with archive.today

options:
  -h, --help       show this help message and exit
  --urls URLS      Path to file containing URLs (one per line)
  --delay DELAY    Delay between submissions in seconds
  --output OUTPUT  CSV file to save results
'''
import argparse
import csv
import os
import re
import time

import requests

def archive_url(session, url):
    """Submit a single URL to archive.ph and return (original_url, archived_url)."""
    try:
        print(f"Archiving: {url}")
        resp = session.get(
            "https://archive.ph/submit/",
            params={"url": url},
            allow_redirects=False,
        )

        # If already archived, archive.ph responds with a 302 redirect to the snapshot
        if resp.status_code == 302:
            archived_url = resp.headers.get("Location", "")
            # Match both 4 and 5 character archive slugs
            match = re.match(r"(https://archive\.ph/\w{4,5})", archived_url)
            if match:
                archived_url = match.group(1)
            print(f"Already archived: {archived_url}")
            return url, archived_url

        # If it needs archiving, a 200 response carries a refresh header pointing to /wip/
        if resp.status_code == 200:
            refresh_header = resp.headers.get("refresh", "")
            match = re.search(r"url=(https?://[^\s]+)", refresh_header)
            if not match:
                print("WIP URL not found in refresh header.")
                return url, None
            wip_url = match.group(1)
            print(f"Archiving in progress (WIP): {wip_url}")
            final_resp = session.get(wip_url, allow_redirects=True)
            if final_resp.status_code == 200:
                archived_url = final_resp.url.replace("/wip/", "/")
                print(f"Archived: {archived_url}")
                return url, archived_url
            else:
                print(f"Failed to retrieve from WIP URL. Status: {final_resp.status_code}")
                return url, None

        print(f"Unexpected status code: {resp.status_code}")
        return url, None
    except Exception as e:
        print(f"Exception archiving {url}: {e}")
        return url, None

def read_urls_from_file(filename):
    """Read URLs from a text file, skipping blank lines and # comment lines."""
    if not os.path.exists(filename):
        print(f"File not found: {filename}")
        return []
    with open(filename, "r") as f:
        urls = [
            line.strip()
            for line in f
            if line.strip() and not line.strip().startswith("#")
        ]
    return urls

def main():
    parser = argparse.ArgumentParser(description="Batch archive URLs with archive.today")
    parser.add_argument("--urls", required=True, help="Path to file containing URLs (one per line)")
    parser.add_argument("--delay", type=int, default=30, help="Delay between submissions in seconds")
    parser.add_argument("--output", default="archived_results.csv", help="CSV file to save results")
    args = parser.parse_args()

    urls = read_urls_from_file(args.urls)
    if not urls:
        print("No URLs to archive.")
        return

    session = requests.Session()
    session.headers.update({
        "User-Agent": ""  # add your own (random) user-agent string here
    })

    with open(args.output, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Original URL", "Archived URL"])
        for idx, url in enumerate(urls, 1):
            print(f"\n[{idx}/{len(urls)}]")
            original, archived = archive_url(session, url)
            writer.writerow([original, archived or ""])
            if idx < len(urls):
                print(f"Waiting {args.delay} seconds before next...")
                time.sleep(args.delay)

    print(f"\nFinished. Results saved to {args.output}")


if __name__ == "__main__":
    main()
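A sample run, assuming your URLs live in a file called urls.txt:

% python3 archiveToday.py --urls urls.txt --delay 10 --output results.csv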