# Content Extraction

Extract specific content (like the title) from each crawled page.

### Extension Code

```python
from hypercrawlturbo import scraper
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup

# Define the URL of the webpage to scrape
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"

# Parse the domain of the URL to scrape
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc

# Set the depth limit for crawling
depth_limit = 2

# Store the visited URLs to avoid revisiting
visited_urls = set()

def is_same_domain(url, base_domain):
    parsed_extracted_url = urlparse(url)
    return parsed_extracted_url.netloc == base_domain

def crawl(url, depth):
    if depth > depth_limit or url in visited_urls:
        return
    
    visited_urls.add(url)
    
    try:
        response = requests.get(url, timeout=10)  # Timeout keeps one slow page from stalling the crawl
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract and process content from the URL
        print(f"Crawling URL: {url}")
        print(f"Title: {soup.title.string if soup.title else 'No title'}")
        
        # Extract URLs from the page
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        
        # Filter and crawl further
        for link in links:
            if is_same_domain(link, base_domain):
                crawl(link, depth + 1)
                
    except requests.RequestException as e:
        print(f"Failed to crawl {url}: {e}")

# Start crawling
crawl(url_to_scrape, 0)

```
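
If you need the extracted titles later rather than only printed, a small variation of the same crawl loop can collect them. The sketch below assumes the imports and globals defined above (`visited_urls`, `depth_limit`, `base_domain`, `is_same_domain`); `page_titles` and `crawl_and_collect` are illustrative names, not part of hypercrawlturbo.

```python
# Sketch: same crawl as above, but titles are collected in a dict
# (page_titles) keyed by URL instead of being printed.
page_titles = {}

def crawl_and_collect(url, depth):
    if depth > depth_limit or url in visited_urls:
        return

    visited_urls.add(url)

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Store the extracted title (or a placeholder) for this URL
        page_titles[url] = soup.title.string if soup.title else 'No title'

        # Follow same-domain links exactly as in the script above
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        for link in links:
            if is_same_domain(link, base_domain):
                crawl_and_collect(link, depth + 1)

    except requests.RequestException as e:
        print(f"Failed to crawl {url}: {e}")

# After crawling, page_titles maps each visited URL to its title
```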

### Explanation

**Function**:

* Uses `BeautifulSoup` to parse the HTML of each crawled page and extract content such as the page title.

**Explanation**:

1. **Parse HTML**: Parse the HTML content of each crawled page with `BeautifulSoup`.
2. **Extract Title**: Read the `<title>` tag of each page and print it, falling back to `'No title'` when the tag is missing; the sketch below shows how the same approach extends to other elements.
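
The same parsing step can pull out more than the title. The snippet below is a minimal sketch of reading other common elements from the `soup` object created inside `crawl()`; the `extract_details` helper and its return format are illustrative assumptions, not part of hypercrawlturbo.

```python
# Sketch: extracting additional content from an already-parsed page.
# `soup` is the BeautifulSoup object created inside crawl();
# every tag is optional, so each lookup is guarded.
def extract_details(soup):
    title = soup.title.string if soup.title else 'No title'

    # <meta name="description" content="..."> if the page provides one
    meta_tag = soup.find('meta', attrs={'name': 'description'})
    description = (
        meta_tag['content']
        if meta_tag and meta_tag.has_attr('content')
        else 'No description'
    )

    # Text of the first <h1>, if any
    h1_tag = soup.find('h1')
    heading = h1_tag.get_text(strip=True) if h1_tag else 'No heading'

    return {'title': title, 'description': description, 'heading': heading}
```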
