Store the crawled data (each URL and its page title) in a CSV file for further analysis or use.
Extension Code
from hypercrawlturbo import scraper
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import csv

# Define the URL of the webpage to scrape
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"

# Parse the domain of the URL to scrape
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc

# Set the depth limit for crawling
depth_limit = 2

# Store the visited URLs to avoid revisiting
visited_urls = set()

# Open a CSV file to store results; keeping it open for the whole crawl
# lets each page be written as soon as it is visited
with open('crawled_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['URL', 'Title'])

    def is_same_domain(url, base_domain):
        parsed_extracted_url = urlparse(url)
        return parsed_extracted_url.netloc == base_domain

    def crawl(url, depth):
        if depth > depth_limit or url in visited_urls:
            return
        visited_urls.add(url)
        try:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract and process content from the URL
            title = soup.title.string if soup.title else 'No title'
            print(f"Crawling URL: {url}")
            print(f"Title: {title}")

            # Write the URL and title to the CSV file
            writer.writerow([url, title])

            # Extract URLs from the page
            links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

            # Filter and crawl further
            for link in links:
                if is_same_domain(link, base_domain):
                    crawl(link, depth + 1)
        except requests.RequestException as e:
            print(f"Failed to crawl {url}: {e}")

    # Start crawling
    crawl(url_to_scrape, 0)
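After the crawl completes, crawled_data.csv holds one row per visited page. As a quick check, the file can be read back with csv.DictReader, since the column names match the URL and Title header written above. This read-back snippet is a minimal illustrative sketch, not part of the crawler itself:

import csv

# Load the crawl results back in and print a short summary
# (assumes only the crawled_data.csv file produced by the code above).
with open('crawled_data.csv', mode='r', newline='') as file:
    reader = csv.DictReader(file)
    rows = list(reader)

print(f"Pages crawled: {len(rows)}")
for row in rows:
    print(f"{row['Title']} -> {row['URL']}")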
Explanation
Function:
Open a CSV file and write the header row.
Store each crawled URL and its page title in the CSV file.
Explanation:
Open CSV File: Python's csv module opens crawled_data.csv and writes the ['URL', 'Title'] header row; the with block keeps the file open for the duration of the crawl.
Write Data: Inside crawl(), the URL and title of each successfully fetched page are written as a row in the CSV file.
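Taken on its own, the CSV step is the standard csv.writer pattern: open the file once, write the header, then write one row per page. The sketch below isolates that pattern from the crawling logic; the two sample rows are hypothetical placeholders, not real crawl output:

import csv

# Open the file, write the header row, then one row per page.
# The row values below are hypothetical placeholders for illustration.
with open('crawled_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['URL', 'Title'])
    writer.writerow(['https://hyperllm.gitbook.io/hyperllm', 'Example title 1'])
    writer.writerow(['https://hyperllm.gitbook.io/hyperllm/docs', 'Example title 2'])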