Robots.txt | Apify & Crawlee | Page 1

sharp vale Jul 21, 2024, 8:16 PM

#

Hey, do you have any idea how to respect robots.txt? Do we have to code that ourselves?

sharp vale Jul 22, 2024, 9:17 AM

#

""" 
Class to respect robot.txt file
"""

import urllib.parse

import aiohttp
from protego import Protego


class RobotTXT:
    """Async callable that checks whether a URL may be crawled per robots.txt.

    Parsed robots.txt files are cached on the instance, keyed by the
    robots.txt URL of each origin (scheme + host), so a site's rules are
    downloaded only on the first lookup. The cache is capped at
    ``_MAX_CACHED_HOSTS`` entries; the oldest entries are dropped first.
    """

    # Upper bound on the number of parsed robots.txt files kept in memory.
    _MAX_CACHED_HOSTS = 1000

    def __init__(self):
        # robots.txt URL -> parsed rules. A plain dict keeps insertion order,
        # which is what the eviction in __call__ relies on.
        self._robots = {}
        # A URL is reported as allowed only if *every* agent here may fetch it.
        self._user_agent = ["*", "GPTBot", "WaveAICrawler"]

    async def __call__(self, url: str) -> bool:
        """Check if the url is allowed to be crawled

        Args:
            url (str): url to be checked

        Returns:
            bool: True if the url is allowed to be crawled, False otherwise
        """
        url_parse = urllib.parse.urlparse(url)
        robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"

        rules = self._robots.get(robots_url)
        if rules is None:
            rules = await self._fetch_rules(robots_url)
            if rules is None:
                # robots.txt answered with a server error: deny this URL but
                # do not cache the failure, so a later call retries the fetch.
                return False
            self._robots[robots_url] = rules

        # dict.popitem() accepts no arguments (``last=False`` is an
        # OrderedDict feature and raises TypeError on a dict), so evict the
        # oldest entry through the dict's insertion order instead.
        while len(self._robots) > self._MAX_CACHED_HOSTS:
            del self._robots[next(iter(self._robots))]

        return all(rules.can_fetch(url, agent) for agent in self._user_agent)

    async def _fetch_rules(self, robots_url: str):
        """Download and parse one robots.txt file.

        Args:
            robots_url (str): absolute URL of the robots.txt file

        Returns:
            The parsed rules, or None when the server answered with a 5xx
            status (robots.txt is unreachable, so crawling must not proceed).
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(robots_url) as response:
                if response.status >= 500:
                    return None
                if response.status >= 400:
                    # No usable robots.txt (e.g. 404): there are no
                    # restrictions, and the error page body is not rules.
                    return Protego.parse("")
                return Protego.parse(await response.text())

#

I have made this script, if you want it.

sharp vale Jul 22, 2024, 4:15 PM

#

and to use it:

#

# Create one checker and reuse it: parsed robots.txt files are cached on the instance.
robots_parser = RobotTXT()
# The instance is called with the URL and awaited; the result is a bool
# (True when the URL is allowed to be crawled).
authorized = await robots_parser(url)

#Robots.txt