Extract specific content (like the title) from each crawled page.
Extension Code
from hypercrawlturbo import scraper
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup

# Define the URL of the webpage to scrape
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"

# Parse the domain of the URL to scrape
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc

# Set the depth limit for crawling
depth_limit = 2

# Store the visited URLs to avoid revisiting
visited_urls = set()

def is_same_domain(url, base_domain):
    parsed_extracted_url = urlparse(url)
    return parsed_extracted_url.netloc == base_domain

def crawl(url, depth):
    if depth > depth_limit or url in visited_urls:
        return
    visited_urls.add(url)
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract and process content from the URL
        print(f"Crawling URL: {url}")
        print(f"Title: {soup.title.string if soup.title else 'No title'}")

        # Extract URLs from the page
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

        # Filter and crawl further, staying on the same domain
        for link in links:
            if is_same_domain(link, base_domain):
                crawl(link, depth + 1)
    except requests.RequestException as e:
        print(f"Failed to crawl {url}: {e}")

# Start crawling
crawl(url_to_scrape, 0)
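When the script runs, it prints a pair of lines for each page it visits, up to the configured depth. The sample below only illustrates the format produced by the two print statements; the URLs and titles are placeholders, not captured output:

Crawling URL: https://hyperllm.gitbook.io/hyperllm
Title: <title of the start page>
Crawling URL: <a same-domain link found on the start page>
Title: <title of that page>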
Explanation
Function:
Uses BeautifulSoup to parse the HTML of each crawled page and extract content such as the page title.
Explanation:
Parse HTML: BeautifulSoup parses the HTML content returned for each crawled URL.
Extract Title: The title of each page is read from soup.title and printed, falling back to 'No title' when the page has no title tag.
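The same soup object can be used to pull out more than the title. The helper below is a minimal sketch (extract_page_data is a hypothetical name, not part of hypercrawlturbo) showing how the crawl function could collect a meta description and headings for each page instead of only printing the title:

# Hypothetical helper: gather several fields from an already-parsed page
def extract_page_data(url, soup):
    description_tag = soup.find('meta', attrs={'name': 'description'})
    return {
        'url': url,
        'title': soup.title.string.strip() if soup.title and soup.title.string else 'No title',
        'description': description_tag['content'] if description_tag and description_tag.has_attr('content') else '',
        'headings': [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2'])],
    }

Inside crawl, the two print statements could then be replaced with a call such as results.append(extract_page_data(url, soup)), where results is a list defined alongside visited_urls, so the extracted content can be processed after the crawl finishes.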