Source code for dirhunt.sources.robots

from itertools import chain

import requests
from requests import RequestException

from dirhunt.sources.base import Source
from dirhunt._compat import RobotFileParser, URLError


def get_url(protocol, domain, path):
    path = path.lstrip('/')
    return '{protocol}://{domain}/{path}'.format(**locals())
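
# Illustrative check (not part of the original module): get_url strips the
# leading slash from the path before joining, so '/admin/' on a hypothetical
# domain becomes 'http://example.com/admin/'.
assert get_url('http', 'example.com', '/admin/') == 'http://example.com/admin/'
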
class DirhuntRobotFileParser(RobotFileParser):
    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            with requests.get(self.url) as response:
                status_code = response.status_code
                text = response.text
        except RequestException:
            pass
        else:
            if status_code in (401, 403):
                # The crawler is explicitly refused: treat every path as disallowed.
                self.disallow_all = True
            elif status_code >= 400 and status_code < 500:
                # Any other client error (e.g. no robots.txt): no rules apply.
                self.allow_all = True
            self.parse(text.splitlines())
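
# Illustrative sketch (not part of the original module): how the overridden
# read() is typically exercised. '_example_read', the domain and the checked
# path are hypothetical; network access is assumed.
def _example_read(domain='example.com'):
    rp = DirhuntRobotFileParser()
    rp.set_url(get_url('https', domain, 'robots.txt'))
    # 401/403 -> treat every path as disallowed; any other 4xx -> no rules
    # apply; otherwise the fetched body is parsed normally.
    rp.read()
    return rp.can_fetch('*', '/admin/')
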
class Robots(Source):
    def callback(self, domain, protocol='http'):
        rp = DirhuntRobotFileParser()
        rp.set_url(get_url(protocol, domain, 'robots.txt'))
        try:
            rp.read()
        except (IOError, URLError):
            if protocol == 'http':
                # Retry over https before giving up on this domain.
                self.callback(domain, 'https')
            return
        entries = list(rp.entries)
        if rp.default_entry:
            entries.append(rp.default_entry)
        # Report every Allow/Disallow path found in robots.txt as a result URL.
        for ruleline in chain(*[entry.rulelines for entry in entries]):
            self.add_result(get_url(protocol, domain, ruleline.path))
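
# Illustrative sketch (not part of the original module): how Robots.callback
# walks the parsed rules. Instead of fetching robots.txt over the network,
# sample lines are fed to the parser directly; the domain and rule paths are
# hypothetical.
if __name__ == '__main__':
    rp = DirhuntRobotFileParser()
    rp.parse([
        'User-agent: *',
        'Disallow: /admin/',
        'Disallow: /backup/',
    ])
    entries = list(rp.entries)
    if rp.default_entry:
        entries.append(rp.default_entry)
    for ruleline in chain(*[entry.rulelines for entry in entries]):
        # Robots.callback turns each rule path into a full URL and reports it
        # through self.add_result(); here the URLs are just printed.
        print(get_url('http', 'example.com', ruleline.path))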