Store the crawled data (URL and title) into a CSV file for further analysis or use.
Extension Code
"""Depth-limited same-domain crawler that records (URL, title) pairs to a CSV file."""

from hypercrawlturbo import scraper  # NOTE(review): imported but never used — confirm before removing
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import csv

# URL of the webpage where crawling starts.
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"

# Domain of the start URL; links pointing outside it are ignored.
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc

# Maximum recursion depth for the crawl.
depth_limit = 2

# URLs already visited, to avoid re-crawling pages (and looping on link cycles).
visited_urls = set()


def is_same_domain(url, base_domain):
    """Return True if *url*'s network location matches *base_domain*."""
    parsed_extracted_url = urlparse(url)
    return parsed_extracted_url.netloc == base_domain


def crawl(url, depth):
    """Recursively crawl *url* up to ``depth_limit``.

    Writes one ``[url, title]`` row per successfully fetched page via the
    module-level ``writer`` (which must be bound to an open CSV writer),
    then follows every same-domain link found on the page.
    """
    if depth > depth_limit or url in visited_urls:
        return
    visited_urls.add(url)
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best-effort crawl: report the failure and skip this page.
        print(f"Failed to crawl {url}: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract and process content from the URL.
    title = soup.title.string if soup.title else 'No title'
    print(f"Crawling URL: {url}")
    print(f"Title: {title}")
    # Write the URL and title to the CSV file.
    writer.writerow([url, title])

    # Extract absolute URLs from the page and crawl the same-domain ones.
    links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
    for link in links:
        if is_same_domain(link, base_domain):
            crawl(link, depth + 1)


if __name__ == "__main__":
    # Keep the CSV file open for the entire crawl: the original closed the
    # ``with`` block before crawling started, so every ``writer.writerow``
    # call inside ``crawl`` hit an already-closed file.
    with open('crawled_data.csv', mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['URL', 'Title'])
        # Start crawling from the configured root URL.
        crawl(url_to_scrape, 0)
Explanation
Functionality:
Open a CSV file and write headers.
Store each crawled URL and its title in the CSV file.
Explanation:
Open CSV File: Use Python's csv module to open a CSV file and write headers.
Write Data: Write the URL and title of each crawled page into the CSV file.