Depth Limitation

Limit the crawl depth to prevent the crawler from following links indefinitely and to control how far it ranges from the starting page.

Extension Code


from hypercrawlturbo import scraper
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup

# Define the URL of the webpage to scrape
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"

# Parse the domain of the URL to scrape
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc

# Set the depth limit for crawling
depth_limit = 2

# Store the visited URLs to avoid revisiting
visited_urls = set()

def is_same_domain(url, base_domain):
    parsed_extracted_url = urlparse(url)
    return parsed_extracted_url.netloc == base_domain

def crawl(url, depth):
    if depth > depth_limit or url in visited_urls:
        return
    
    visited_urls.add(url)
    
    try:
        response = requests.get(url, timeout=10)  # timeout keeps one slow page from stalling the crawl
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract and process content from the URL
        print(f"Crawling URL: {url}")
        print(f"Title: {soup.title.string if soup.title else 'No title'}")
        
        # Extract URLs from the page
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        
        # Filter and crawl further
        for link in links:
            if is_same_domain(link, base_domain):
                crawl(link, depth + 1)
                
    except requests.RequestException as e:
        print(f"Failed to crawl {url}: {e}")

# Start crawling
crawl(url_to_scrape, 0)
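
With depth_limit = 2 and the initial call crawl(url_to_scrape, 0), depth counts link-hops from the starting page, and the depth > depth_limit check stops one level past the limit. An illustrative trace (the page names here are hypothetical, not actual output):

# crawl(start_url, 0)           -> fetched  (depth 0 <= 2)
#   crawl(linked_page, 1)       -> fetched  (depth 1 <= 2)
#     crawl(linked_page, 2)     -> fetched  (depth 2 <= 2)
#       crawl(linked_page, 3)   -> skipped  (depth 3 > 2)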

Explanation

Function:

  • crawl(url, depth): Recursively crawls same-domain URLs up to the specified depth limit.

Key points:

  1. Depth Limit: The depth_limit variable caps how many link-hops from the starting page are crawled.

  2. Visited URLs: A set of visited URLs ensures the same page is never fetched twice.

  3. Recursive Crawling: The crawl function calls itself for each same-domain link, checking the depth before proceeding; an iterative alternative is sketched below.
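
The same depth limit can also be enforced without recursion by keeping (url, depth) pairs in a queue, which crawls breadth-first and scales to larger depth limits without deep call stacks. This is a minimal sketch using the same requests/BeautifulSoup stack; crawl_iterative and its parameters are illustrative names, not part of hypercrawlturbo:

from collections import deque
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup

def crawl_iterative(start_url, depth_limit=2):
    base_domain = urlparse(start_url).netloc
    visited = set()
    queue = deque([(start_url, 0)])  # (url, depth) pairs waiting to be fetched

    while queue:
        url, depth = queue.popleft()
        if depth > depth_limit or url in visited:
            continue
        visited.add(url)

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to crawl {url}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        print(f"Crawling URL: {url}")

        # Queue same-domain links one level deeper than the current page
        for a in soup.find_all('a', href=True):
            link = urljoin(url, a['href'])
            if urlparse(link).netloc == base_domain:
                queue.append((link, depth + 1))

    return visited

# Example: crawl_iterative("https://hyperllm.gitbook.io/hyperllm", depth_limit=2)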
