import pdfkit
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def save_page_to_pdf(url, page_number, config=None):
    # 'quiet' suppresses wkhtmltopdf's progress output on stdout.
    options = {'quiet': ''}
    pdfkit.from_url(url, f'output_page_{page_number}.pdf',
                    options=options, configuration=config)
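
# The options dict maps directly onto wkhtmltopdf command-line flags. For
# example (these extra flags are common choices, not part of the original
# script), page size and encoding can be set the same way:
#
#     options = {'quiet': '', 'page-size': 'A4', 'encoding': 'UTF-8'}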


def crawl_and_save(base_url, page_number=1, visited_urls=None, config=None):
    # A fresh set per top-level call avoids the shared-mutable-default pitfall.
    if visited_urls is None:
        visited_urls = set()
    if base_url in visited_urls:
        print(f"Already visited: {base_url}")
        return
    visited_urls.add(base_url)
    print(f"Processing: {base_url}")
    save_page_to_pdf(base_url, page_number, config)
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.find_all('a', href=True):
        absolute_url = urljoin(base_url, link['href'])
        # Number pages by visit count so sibling links don't all receive the
        # same page_number and overwrite each other's PDFs.
        crawl_and_save(absolute_url, len(visited_urls) + 1, visited_urls, config)
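

# Optional helper (a minimal sketch, not part of the original script): the
# crawler above follows every link it finds, including links to other sites.
# Guarding the recursive call with `if is_same_domain(base_url, absolute_url):`
# keeps the crawl on a single domain.
def is_same_domain(base_url, candidate_url):
    # Two URLs belong to the same site when their network locations match.
    from urllib.parse import urlparse
    return urlparse(base_url).netloc == urlparse(candidate_url).netloc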


if __name__ == "__main__":
    base_url = "your_base_url"
    # wkhtmltopdf is a separate binary; download it from
    # https://wkhtmltopdf.org/downloads.html and point pdfkit at the executable.
    pdfkit_config = pdfkit.configuration(wkhtmltopdf='/path/to/wkhtmltopdf')
    # On Windows, wrap a path containing spaces in quotes, e.g.:
    # pdfkit_config = pdfkit.configuration(wkhtmltopdf=r'"C:\path with spaces\wkhtmltopdf.exe"')
    crawl_and_save(base_url, page_number=1, config=pdfkit_config)