Limit the crawling depth to prevent the crawler from running indefinitely and to keep the scope of the crawl under control.
Extension Code
from hypercrawlturbo import scraper
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
# Define the URL of the webpage to scrape
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"
# Parse the domain of the URL to scrape
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc
# Set the depth limit for crawling
depth_limit = 2
# Store the visited URLs to avoid revisiting
visited_urls = set()
def is_same_domain(url, base_domain):
    parsed_extracted_url = urlparse(url)
    return parsed_extracted_url.netloc == base_domain

def crawl(url, depth):
    if depth > depth_limit or url in visited_urls:
        return
    visited_urls.add(url)
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract and process content from the URL
        print(f"Crawling URL: {url}")
        print(f"Title: {soup.title.string if soup.title else 'No title'}")
        # Extract URLs from the page
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        # Filter and crawl further
        for link in links:
            if is_same_domain(link, base_domain):
                crawl(link, depth + 1)
    except requests.RequestException as e:
        print(f"Failed to crawl {url}: {e}")

# Start crawling
crawl(url_to_scrape, 0)
Explanation
Function:
crawl(url, depth): Recursively crawls URLs up to the specified depth limit.
Key Points:
Depth Limit: The depth_limit variable caps how many levels of links the crawler will follow from the starting page.
Visited URLs: A set of visited URLs ensures each page is crawled at most once.
Recursive Crawling: The crawl function checks the depth and the visited set before fetching a page, then calls itself on every same-domain link it finds (an iterative variant is sketched below).
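On larger sites, deep recursion can hit Python's recursion limit. Below is a minimal iterative sketch of the same depth-limited crawl using a queue instead of recursion. It assumes only requests and BeautifulSoup as above; the names crawl_bfs and max_pages are illustrative and not part of hypercrawlturbo.

from collections import deque
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup

def crawl_bfs(start_url, depth_limit=2, max_pages=50):
    base_domain = urlparse(start_url).netloc
    visited = set()
    queue = deque([(start_url, 0)])  # (url, depth) pairs

    while queue and len(visited) < max_pages:
        url, depth = queue.popleft()
        # Skip URLs that are too deep or already crawled
        if url in visited or depth > depth_limit:
            continue
        visited.add(url)
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to crawl {url}: {e}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        print(f"Crawling URL: {url}")
        print(f"Title: {soup.title.string if soup.title else 'No title'}")
        # Queue same-domain links one level deeper
        for a in soup.find_all('a', href=True):
            link = urljoin(url, a['href'])
            if urlparse(link).netloc == base_domain and link not in visited:
                queue.append((link, depth + 1))

crawl_bfs("https://hyperllm.gitbook.io/hyperllm", depth_limit=2, max_pages=50)

The optional max_pages cap bounds the total number of pages fetched, complementing the depth limit when a site has many links per page.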