I found out that my earlier code was wrong and would run into an infinite loop, so I have reworked it to avoid that:
from bs4 import BeautifulSoup
import requests
# One shared HTTP session for every request made by this script; it sends a
# desktop Chrome User-Agent string instead of the library default.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
def parse_and_scrape(url, visitedLinks, filename="links.txt", max_links=100000, depth=0, max_depth=10, timeout=10):
    """Recursively crawl ``url``, appending newly discovered links to ``filename``.

    The page at ``url`` is fetched with the module-level ``session`` and the
    ``href`` of every ``<a>`` tag that starts with ``https://`` is collected.
    Each collected link that is not yet in ``visitedLinks`` is appended to
    ``filename`` and then crawled in turn, depth-first.

    Args:
        url: Absolute URL of the page to fetch. An empty value is ignored.
        visitedLinks: Mutable set of URLs that must not be fetched again. It
            is updated in place and shared by every recursive call.
        filename: Path of the text file that receives one discovered link per
            line (opened in append mode).
        max_links: Maximum number of distinct links taken from a single page.
        depth: Recursion depth of this call; top-level callers leave it at 0.
        max_depth: Deepest recursion level that is still crawled.
        timeout: Seconds to wait for the server before the request is
            abandoned and reported as a failure.

    Returns:
        None. The results are the side effects on ``visitedLinks``,
        ``filename`` and standard output.
    """
    if depth > max_depth:
        return
    # Skip empty URLs and anything that has already been attempted
    if not url or url in visitedLinks:
        return

    # Record the URL before fetching so that a page which fails to load is
    # not requested again every time another page links to it.
    visitedLinks.add(url)

    try:
        # The timeout keeps a single unresponsive server from stalling the crawl
        resp = session.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print("Failed to open: " + url + " | " + str(e))
        return

    soup = BeautifulSoup(resp.content, "html.parser")

    # Collect up to max_links distinct https links, keeping page order
    links = []
    collected = set()
    for anchor in soup.find_all('a'):
        if len(links) >= max_links:
            break
        href = anchor.get('href')
        if href and href.startswith("https://") and href not in collected:
            collected.add(href)
            links.append(href)
            print(href)

    for link in links:
        # Re-check on every iteration: a deeper call may have visited this
        # link while an earlier sibling was being crawled.
        if link in visitedLinks:
            continue
        # Append and close immediately so no file handle stays open (with
        # unflushed data) during the recursive crawl below.
        with open(filename, 'a') as file:
            file.write(link + '\n')
        print("next: " + link)
        parse_and_scrape(link, visitedLinks, filename, max_links, depth + 1, max_depth, timeout)
# Build the visited set from the lines already present in links.txt; when the
# file does not exist yet, start from an empty set instead.
try:
    with open("links.txt", 'r') as file:
        visitedLinks = set(file.read().splitlines())
except FileNotFoundError:
    visitedLinks = set()
# Example usage: start crawling at the CNN front page, collecting at most
# 10 links from each page that is fetched.
start_url = "https://cnn.com"
parse_and_scrape(start_url, visitedLinks, max_links=10)