[ot][spam]gpt-4 completions
Undescribed Horrific Abuse, One Victim & Survivor of Many
gmkarl at gmail.com
Fri Mar 17 04:35:03 PDT 2023
2023-03-17 07:33:22 {
"input": "Write a complete python script to maintain a mirror of a
website as a git repository, crawling the domain like wget’s spider
behavior; update every page each run:"
}
```python
import os
import sys
import time
import hashlib
import requests
from bs4 import BeautifulSoup
from git import Repo
from urllib.parse import urlparse, urljoin
# Configuration
# Start URL of the crawl; crawl() only follows links whose host matches this URL's host.
website_url = "https://example.com"
# Directory that save_page() writes mirrored files under (one sub-directory per host).
output_directory = "website_mirror"
# Path of the git repository that main() initialises, changes into, and commits to.
git_repository = "website_mirror.git"
crawl_delay = 1 # in seconds
# URLs that crawl() has already started processing; shared across the whole run.
visited_urls = set()
def is_valid_url(url):
    """Tell whether *url* is absolute, i.e. has both a scheme and a host part.

    Returns:
        True when the parsed URL has a non-empty scheme and a non-empty
        network location, False otherwise.
    """
    scheme, netloc = urlparse(url)[:2]
    return all((netloc, scheme))
def get_all_links(url):
    """Fetch *url* and return the absolute URLs referenced by its ``<a>`` tags.

    Relative hrefs are resolved against *url*. Fragment identifiers are
    removed so that ``page#a`` and ``page#b`` are reported as the same page.
    Pure in-page anchors (hrefs starting with ``#``) and links that do not
    resolve to an absolute URL are ignored.

    Args:
        url: Address of the page to download and scan.

    Returns:
        A set of absolute URL strings. The set is empty when the page cannot
        be downloaded; the error is printed rather than raised so that one
        unreachable page does not abort the caller.
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return set()

    soup = BeautifulSoup(response.content, "html.parser")
    urls = set()
    for link in soup.find_all("a"):
        href = link.get("href")
        if not href:
            continue
        href = href.strip()
        if not href or href.startswith("#"):
            continue
        try:
            # Drop the fragment: it addresses a location inside the page,
            # not a different resource.
            full_url = urlparse(urljoin(url, href))._replace(fragment="").geturl()
        except ValueError:
            # Malformed href (e.g. an unterminated IPv6 literal); skip it.
            continue
        if is_valid_url(full_url):
            urls.add(full_url)
    return urls
def save_page(url, content):
    """Write *content* to the mirror location that corresponds to *url*.

    The file is stored at ``<output_directory>/<host>/<url path>``. URLs whose
    path is empty or ends with ``/`` name a directory, so their content is
    stored as ``index.html`` inside that directory.

    Args:
        url: Absolute URL the content was downloaded from.
        content: Raw response body as bytes.

    Returns:
        None. URLs whose path would resolve outside ``output_directory``
        (via ``..`` segments) are skipped with a printed message.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    parsed_url = urlparse(url)
    relative_path = parsed_url.path.lstrip("/")
    if not relative_path or relative_path.endswith("/"):
        # A directory-style URL cannot be opened as a file; give it a name.
        relative_path += "index.html"

    mirror_root = os.path.abspath(output_directory)
    path = os.path.abspath(
        os.path.join(mirror_root, parsed_url.netloc, relative_path)
    )
    # The path comes from remote content: refuse to write outside the mirror.
    if os.path.commonpath([mirror_root, path]) != mirror_root:
        print(f"Skipping unsafe path for: {url}")
        return

    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        f.write(content)
def crawl(url):
    """Mirror *url* and every same-host page reachable from it.

    Each page is downloaded, stored with ``save_page`` and scanned with
    ``get_all_links``; links whose host equals the host of ``website_url``
    are queued for the same treatment. URLs already present in
    ``visited_urls`` are skipped, and every processed URL is added to it.

    The traversal uses an explicit stack instead of recursion so that deep
    sites cannot exhaust Python's recursion limit. Download and save errors
    are printed and the crawl continues with the remaining pages.

    Args:
        url: Absolute URL at which to start crawling.
    """
    site_netloc = urlparse(website_url).netloc
    pending = [url]
    first_request = True

    while pending:
        current = pending.pop()
        if current in visited_urls:
            continue

        # Pause between consecutive requests, but not before the first one.
        if not first_request:
            time.sleep(crawl_delay)
        first_request = False

        print(f"Crawling: {current}")
        visited_urls.add(current)

        try:
            # Without a timeout a stalled server would block the crawl forever.
            response = requests.get(current, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            continue

        try:
            save_page(current, response.content)
        except OSError as e:
            # One page that cannot be written must not abort the whole mirror.
            print(f"Error: {e}")

        try:
            links = get_all_links(current)
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            continue

        pending.extend(
            link
            for link in links
            if link not in visited_urls
            and urlparse(link).netloc == site_netloc
        )
def main():
    """Crawl ``website_url`` into the git repository and commit any changes.

    Creates the repository at ``git_repository`` when that path does not
    exist yet, changes the working directory into it so mirrored files land
    inside the work tree, runs the crawl, and commits everything if the work
    tree differs from the last commit.
    """
    if not os.path.exists(git_repository):
        print(f"Initializing git repository: {git_repository}")
        Repo.init(git_repository)
    repo = Repo(git_repository)
    os.chdir(git_repository)

    print("Crawling website...")
    crawl(website_url)

    # is_dirty() ignores untracked files unless asked; newly mirrored pages
    # are untracked, so without this flag a first run would commit nothing.
    if repo.is_dirty(untracked_files=True):
        print("Committing changes to git repository...")
        repo.git.add(A=True)
        repo.git.commit(m="Update website mirror")
    else:
        print("No changes detected.")


# Run the mirror only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
```
32273 ms
More information about the cypherpunks
mailing list