Content Extraction

First step towards HyperScraping

Extract specific content (like the title) from each crawled page.

Extension Code


from hypercrawlturbo import scraper
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup

# Define the URL of the webpage to scrape
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"

# Parse the domain of the URL to scrape
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc

# Set the depth limit for crawling
depth_limit = 2

# Store the visited URLs to avoid revisiting
visited_urls = set()

def is_same_domain(url, base_domain):
    # Only links on the same domain as the start URL are crawled further
    parsed_extracted_url = urlparse(url)
    return parsed_extracted_url.netloc == base_domain

def crawl(url, depth):
    # Stop once the depth limit is exceeded or the URL has already been seen
    if depth > depth_limit or url in visited_urls:
        return
    
    visited_urls.add(url)
    
    try:
        response = requests.get(url, timeout=10)  # Timeout so a slow page cannot stall the crawl
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract and process content from the URL
        print(f"Crawling URL: {url}")
        print(f"Title: {soup.title.string if soup.title else 'No title'}")
        
        # Extract URLs from the page
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        
        # Filter and crawl further
        for link in links:
            if is_same_domain(link, base_domain):
                crawl(link, depth + 1)
                
    except requests.RequestException as e:
        print(f"Failed to crawl {url}: {e}")

# Start crawling
crawl(url_to_scrape, 0)

Explanation

Function:

  • Uses BeautifulSoup to parse the HTML of each crawled page and extract content such as the page title (a short standalone sketch of this idea follows).
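
The sketch below shows the same idea outside the crawler. The sample HTML string is invented purely for illustration; it demonstrates that the extraction is not limited to the title tag and can just as easily pull headings or the meta description.

from bs4 import BeautifulSoup

# Invented sample page, used only to illustrate the extraction calls
sample_html = """
<html>
  <head>
    <title>HyperLLM Docs</title>
    <meta name="description" content="Documentation for HyperLLM.">
  </head>
  <body>
    <h1>Content Extraction</h1>
    <h2>First step towards HyperScraping</h2>
  </body>
</html>
"""

soup = BeautifulSoup(sample_html, 'html.parser')

# Page title (None-safe, as in the crawler above)
print(soup.title.string if soup.title else 'No title')

# All heading texts
print([h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3'])])

# Meta description, if the page defines one
meta = soup.find('meta', attrs={'name': 'description'})
print(meta['content'] if meta else 'No description')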

Steps:

  1. Parse HTML: Use BeautifulSoup to parse the HTML content of each crawled page.

  2. Extract Title: Extract the title of each page and print it, falling back to 'No title' when a page has none (a small variation that stores the titles instead is sketched below).
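
If the titles should be kept rather than only printed, one possible variation is to collect them in a dictionary keyed by URL. The names page_titles and extract_title below are illustrative, not part of hypercrawlturbo.

# Hypothetical variation: store results instead of printing them
page_titles = {}

def extract_title(soup):
    # Step 2 from above, factored into a helper
    return soup.title.string if soup.title else 'No title'

# Inside crawl(), after soup is built (step 1), record the result:
#     page_titles[url] = extract_title(soup)
# Once crawl(url_to_scrape, 0) finishes, page_titles maps every visited
# URL on the domain to its extracted title.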
