Source code for dsci524_group29_webscraping.fetch_html

# fetch_html.py
# author: Lixuan Lin
# date: 2025-01-16

import requests
from requests.exceptions import RequestException

def fetch_html(url, timeout=10):
    """
    Fetch the HTML content of a given URL with an HTTP GET request.

    Parameters:
        url (str): The URL of the webpage to fetch.
        timeout (int, optional): The maximum time to wait for a response,
            in seconds. Defaults to 10 seconds.

    Returns:
        str: The response body as text if the request is successful.

    Raises:
        ValueError: If the URL is None, empty, or blank; or if the request
            fails for any reason other than a timeout (for example an
            invalid URL, a connection problem, or a non-success HTTP
            status reported by ``raise_for_status``). When the failure
            came from `requests`, the original
            ``requests.exceptions.RequestException`` is available as
            ``__cause__``.
        requests.exceptions.Timeout: If the request times out before
            receiving a response. It is re-raised unchanged.

    Examples:
        Fetch the HTML content of a webpage:

        >>> html_content = fetch_html("https://example.com")  # doctest: +SKIP
        >>> print(html_content[:100])  # doctest: +SKIP

    Notes:
        - This function uses the `requests` library to perform an HTTP GET
          request.
        - Ensure the `requests` library is installed before using this
          function.
    """
    # Fail fast on an obviously unusable URL. The exception type and message
    # prefix match the request-failure path below, so callers that catch
    # ValueError keep working.
    if url is None or (isinstance(url, (str, bytes)) and not url.strip()):
        raise ValueError(
            f"Failed to fetch HTML from {url}: URL must be a non-empty string"
        )

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.Timeout:
        # Timeouts are deliberately surfaced as-is so callers can retry.
        raise
    except RequestException as e:
        # Chain the cause so the traceback shows the underlying request error
        # as the direct cause rather than as an error during handling.
        raise ValueError(f"Failed to fetch HTML from {url}: {e}") from e

    return response.text