Extract specific content (like the title) from each crawled page.
Extension Code
```python
from hypercrawlturbo import scraper
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup

# Define the URL of the webpage to scrape
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"

# Parse the domain of the URL to scrape
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc

# Set the depth limit for crawling
depth_limit = 2

# Store the visited URLs to avoid revisiting
visited_urls = set()

def is_same_domain(url, base_domain):
    parsed_extracted_url = urlparse(url)
    return parsed_extracted_url.netloc == base_domain

def crawl(url, depth):
    if depth > depth_limit or url in visited_urls:
        return
    visited_urls.add(url)
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract and process content from the URL
        print(f"Crawling URL: {url}")
        print(f"Title: {soup.title.string if soup.title else 'No title'}")

        # Extract URLs from the page
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

        # Filter and crawl further
        for link in links:
            if is_same_domain(link, base_domain):
                crawl(link, depth + 1)
    except requests.RequestException as e:
        print(f"Failed to crawl {url}: {e}")

# Start crawling
crawl(url_to_scrape, 0)
```
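The same BeautifulSoup parsing step can pull out more than the title. As a minimal self-contained sketch (extract_page_details is a hypothetical helper, and the heading and meta-description selectors are illustrative, not part of hypercrawlturbo), here is how the extraction block inside crawl() could be extended:

```python
import requests
from bs4 import BeautifulSoup

def extract_page_details(url):
    # Hypothetical helper mirroring the extraction step in crawl()
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Page title, as in the main crawler
    print(f"Title: {soup.title.string if soup.title else 'No title'}")

    # First <h1> heading, if the page has one (illustrative selector)
    h1 = soup.find('h1')
    print(f"Heading: {h1.get_text(strip=True) if h1 else 'No heading'}")

    # <meta name="description"> content, if present (illustrative selector)
    meta = soup.find('meta', attrs={'name': 'description'})
    print(f"Description: {meta['content'] if meta and meta.has_attr('content') else 'No description'}")

extract_page_details("https://hyperllm.gitbook.io/hyperllm")
```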
Explanation
Function:
Uses BeautifulSoup to parse the HTML of each crawled page and extract specific content, such as the page title.
Explanation:
Parse HTML: Use BeautifulSoup to parse the HTML content of each crawled page.
Extract Title: Extract the title of each page and print it, falling back to 'No title' when a page has no <title> tag.
Follow Links: Collect every same-domain link on the page and recursively crawl it until depth_limit is reached (see the standalone sketch of this filtering step below).
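Here is the link-collection and same-domain filtering step in isolation: a minimal sketch, assuming the page's HTML is already in hand (the html string below is a stand-in for response.text from a fetched page):

```python
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

# Stand-in HTML for a fetched page; a real crawl would use response.text
html = '<a href="/docs">Docs</a> <a href="https://other.example/x">Out</a>'
page_url = "https://hyperllm.gitbook.io/hyperllm"
base_domain = urlparse(page_url).netloc

soup = BeautifulSoup(html, 'html.parser')

# Resolve relative hrefs against the page URL, then keep same-domain links
links = [urljoin(page_url, a['href']) for a in soup.find_all('a', href=True)]
same_domain = [link for link in links if urlparse(link).netloc == base_domain]
print(same_domain)  # ['https://hyperllm.gitbook.io/docs']
```

Resolving each href with urljoin before comparing netloc values is what lets the crawler treat relative links (like /docs) the same as absolute ones.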