#This program isn't recursively running through the link list; I'm not sure why

8 messages · Page 1 of 1 (latest)

molten crowBOT
#

@haughty linden

File Attachments Not Allowed

For safety reasons we do not allow files with certain file extensions.

Code Formatting

You can share your code using triple backticks like this:
```
YOUR CODE
```

Large Portions of Code

For longer scripts use Hastebin or GitHub Gists and share the link here

Ignored these files due to them having disallowed file extensions
ocean leaf
#

@haughty linden post code like:
```
code here
```

haughty linden
hoary atlas
#

So move adding to the linked list after the function call

#

I have found out that I was wrong: that would give an infinite loop, so I have changed the code around to avoid that

from bs4 import BeautifulSoup
import requests

# One module-level session used for every request the crawler makes; it
# identifies itself with a desktop-browser User-Agent string.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'

def parse_and_scrape(url, visitedLinks, filename="links.txt", max_links=100000, depth=0, max_depth=10, timeout=10):
    """Crawl ``url`` depth-first, recording and following its https links.

    Args:
        url: Page to fetch. Empty or already-visited URLs are skipped.
        visitedLinks: Set of URLs that have already been handled. It is
            updated in place and shared by every recursive call.
        filename: File that each newly found link is appended to, one
            link per line.
        max_links: Maximum number of distinct links collected from a
            single page.
        depth: Recursion depth of this call (0 for the start page).
        max_depth: Calls whose ``depth`` exceeds this return immediately.
        timeout: Seconds to wait on the server before the request is
            abandoned.

    Returns:
        None. Request failures are printed and do not stop the crawl.
    """
    if depth > max_depth or not url or url in visitedLinks:
        return

    # Mark the URL before fetching it so that a page which fails to load
    # is not requested again each time another page links to it.
    visitedLinks.add(url)

    # Only the network call can raise RequestException, so keep the try
    # block limited to it. The timeout stops one stalled server from
    # hanging the whole crawl.
    try:
        resp = session.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print("Failed to open: " + url + " | " + str(e))
        return

    soup = BeautifulSoup(resp.content, "html.parser")

    # Collect absolute https links in page order, ignoring repeats so that
    # duplicates do not use up the max_links budget.
    links = []
    seen_on_page = set()
    for anchor in soup.find_all('a'):
        if len(links) >= max_links:
            break
        href = anchor.get('href')
        if href and href.startswith("https://") and href not in seen_on_page:
            seen_on_page.add(href)
            links.append(href)
            print(href)

    for link in links:
        # Re-check on every iteration: an earlier recursive call may have
        # visited this link in the meantime.
        if link in visitedLinks:
            continue
        # Open, write and close before recursing so that nested calls never
        # hold several handles to the same file and lines are flushed in
        # discovery order.
        with open(filename, 'a') as file:
            file.write(link + '\n')
        print("next: " + link)
        parse_and_scrape(link, visitedLinks, filename, max_links, depth + 1, max_depth, timeout)

# Build the visited set, seeding it with every line already stored in
# links.txt (a missing file simply leaves the set empty).
# NOTE(review): links.txt holds links that parse_and_scrape appended when it
# found them, and parse_and_scrape skips any URL already in this set, so
# links recorded by an earlier run are not crawled again -- confirm that this
# is the intended resume behaviour.
visitedLinks = set()
try:
    with open("links.txt", 'r') as saved_links:
        for saved_link in saved_links.read().splitlines():
            visitedLinks.add(saved_link)
except FileNotFoundError:
    pass

# Example usage: start at the CNN front page, taking at most 10 links per page.
parse_and_scrape("https://cnn.com", visitedLinks, max_links=10)
molten crowBOT