Store the crawled data (each URL and its page title) in a CSV file for further analysis or use.
Extension Code
from hypercrawlturbo import scraper
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import csv

# Define the URL of the webpage to scrape
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"

# Parse the domain of the URL to scrape
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc

# Set the depth limit for crawling
depth_limit = 2

# Store the visited URLs to avoid revisiting
visited_urls = set()

# Open a CSV file to store results; keeping it open for the whole crawl
# lets each page be written as soon as it is visited
with open('crawled_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['URL', 'Title'])

    def is_same_domain(url, base_domain):
        parsed_extracted_url = urlparse(url)
        return parsed_extracted_url.netloc == base_domain

    def crawl(url, depth):
        if depth > depth_limit or url in visited_urls:
            return
        visited_urls.add(url)
        try:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract and process content from the URL
            title = soup.title.string if soup.title else 'No title'
            print(f"Crawling URL: {url}")
            print(f"Title: {title}")

            # Write the URL and title to the CSV file
            writer.writerow([url, title])

            # Extract URLs from the page
            links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

            # Filter and crawl further
            for link in links:
                if is_same_domain(link, base_domain):
                    crawl(link, depth + 1)
        except requests.RequestException as e:
            print(f"Failed to crawl {url}: {e}")

    # Start crawling
    crawl(url_to_scrape, 0)
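After the crawl completes, crawled_data.csv holds one row per visited page. As a quick check, the file can be read back with csv.DictReader, since the column names match the URL and Title header written above. This read-back snippet is a minimal illustrative sketch, not part of the crawler itself:

import csv

# Load the crawl results back in and print a short summary
# (assumes only the crawled_data.csv file produced by the code above).
with open('crawled_data.csv', mode='r', newline='') as file:
    reader = csv.DictReader(file)
    rows = list(reader)

print(f"Pages crawled: {len(rows)}")
for row in rows:
    print(f"{row['Title']} -> {row['URL']}")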
Explanation
Function:
Open a CSV file and write the header row.
Store each crawled URL and its page title in the CSV file.
Explanation:
Open CSV File: Python's csv module opens crawled_data.csv and writes the ['URL', 'Title'] header row; the with block keeps the file open for the duration of the crawl.
Write Data: Inside crawl(), the URL and title of each successfully fetched page are written as a row in the CSV file.
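Taken on its own, the CSV step is the standard csv.writer pattern: open the file once, write the header, then write one row per page. The sketch below isolates that pattern from the crawling logic; the two sample rows are hypothetical placeholders, not real crawl output:

import csv

# Open the file, write the header row, then one row per page.
# The row values below are hypothetical placeholders for illustration.
with open('crawled_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['URL', 'Title'])
    writer.writerow(['https://hyperllm.gitbook.io/hyperllm', 'Example title 1'])
    writer.writerow(['https://hyperllm.gitbook.io/hyperllm/docs', 'Example title 2'])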