Source code for dirhunt.sources.robots
from itertools import chain

import requests
from requests import RequestException

from dirhunt.sources.base import Source
from dirhunt._compat import RobotFileParser, URLError


def get_url(protocol, domain, path):
    path = path.lstrip('/')
    return '{protocol}://{domain}/{path}'.format(**locals())
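
For reference, get_url simply joins the pieces after stripping any leading slash from the path (the values below are illustrative):

get_url('http', 'example.com', '/admin/')      # -> 'http://example.com/admin/'
get_url('https', 'example.com', 'robots.txt')  # -> 'https://example.com/robots.txt'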


class DirhuntRobotFileParser(RobotFileParser):
    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            # Fetch with requests rather than urllib so robots.txt follows
            # the same HTTP behaviour as the rest of dirhunt's traffic.
            with requests.get(self.url) as response:
                status_code = response.status_code
                text = response.text
        except RequestException:
            # Network failures leave disallow_all/allow_all at their defaults.
            pass
        else:
            if status_code in (401, 403):
                # As in urllib.robotparser: auth errors mean everything is disallowed.
                self.disallow_all = True
            elif 400 <= status_code < 500:
                # Any other client error (e.g. 404) means everything is allowed.
                self.allow_all = True
            else:
                # Successful response: feed the body to the parser.
                self.parse(text.splitlines())
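
A minimal sketch of exercising the parser on its own; example.com and the agent name are placeholders, and rp.read() performs a real HTTP request:

rp = DirhuntRobotFileParser()
rp.set_url('http://example.com/robots.txt')
rp.read()  # network errors are swallowed; the flags below keep their defaults

if rp.disallow_all:
    print('401/403: treat every path as disallowed')
elif rp.allow_all:
    print('other 4xx: treat every path as allowed')
else:
    print('crawlable:', rp.can_fetch('dirhunt', '/admin/'))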


class Robots(Source):
    def callback(self, domain, protocol='http'):
        rp = DirhuntRobotFileParser()
        rp.set_url(get_url(protocol, domain, 'robots.txt'))
        try:
            rp.read()
        except (IOError, URLError):
            # Retry once over https before giving up on the domain.
            if protocol == 'http':
                self.callback(domain, 'https')
            return
        entries = list(rp.entries)
        if rp.default_entry:
            entries.append(rp.default_entry)
        # Every Allow/Disallow rule line names a path worth probing.
        for ruleline in chain.from_iterable(entry.rulelines for entry in entries):
            self.add_result(get_url(protocol, domain, ruleline.path))
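
To see what the extraction above surfaces, here is a hedged sketch that feeds a hand-written robots.txt to the parser instead of fetching one. Note that entries, default_entry and rulelines are undocumented internals of the standard library parser; this sketch relies on the same assumption the code does:

rp = DirhuntRobotFileParser()
rp.parse([
    'User-agent: *',
    'Disallow: /admin/',
    'Allow: /public/',
])

entries = list(rp.entries)
if rp.default_entry:
    entries.append(rp.default_entry)
for ruleline in chain.from_iterable(entry.rulelines for entry in entries):
    print(get_url('http', 'example.com', ruleline.path))
# http://example.com/admin/
# http://example.com/public/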