Source code for dsci524_group29_webscraping.parse_content

# parse_content.py
# author: Sienko Ikhabi
# date: 2025-01-16

from lxml import html, etree

def parse_content(html_content, selector, selector_type='css'):
    """
    Parse HTML content and extract data from the elements matching a selector.

    Parameters:
        html_content (str): The raw HTML content to be parsed.
        selector (str): The query used to locate elements in the HTML content.
            - For CSS selectors: use `.class`, `#id`, or `tagname`.
            - For XPath: use expressions like `//tag[@attribute='value']`.
        selector_type (str, optional): The type of selector to use.
            Case-insensitive. Default is 'css'. Options:
            - 'css': `selector` is a CSS selector
              (e.g., `.item` selects elements with class "item").
            - 'xpath': `selector` is an XPath expression
              (e.g., `//div[@class='item']` selects <div> elements with
              class "item").

    Returns:
        list: One dictionary per matched element, each of the form
            `{'value': <element .text>}`. The text is returned as-is
            (it is not whitespace-stripped).
            - Example output:
              `[{'value': 'alfa'}, {'value': 'bravo'}, {'value': 'charlie'}]`.

    Raises:
        ValueError: If `selector_type` is not 'css' or 'xpath' (including
            non-string values), or if any error occurs while parsing the
            HTML or evaluating the selector. In the latter case the
            underlying exception is attached as `__cause__`.

    Example:
        # Sample HTML content
        html_content = '<html><body><div class="item">alfa</div><div class="item">bravo</div><div class="item">charlie</div></body></html>'

        # Using a CSS selector
        parse_content(html_content, ".item")
        # Returns: [{'value': 'alfa'}, {'value': 'bravo'}, {'value': 'charlie'}]

        # Using an XPath selector
        parse_content(html_content, "//div[@class='item']", selector_type='xpath')
        # Returns: [{'value': 'alfa'}, {'value': 'bravo'}, {'value': 'charlie'}]
    """
    # Normalise once. A non-string selector_type is mapped to None so it is
    # rejected with the documented ValueError instead of an AttributeError
    # from calling .lower() on it.
    kind = selector_type.lower() if isinstance(selector_type, str) else None
    if kind not in ('xpath', 'css'):
        raise ValueError(
            f"Invalid selector_type '{selector_type}'. "
            "Only CSS/XPath selectors are supported."
        )

    try:
        # Parse the HTML content into a document object
        doc = html.fromstring(html_content)

        # Locate the matching elements with the requested selector flavour
        if kind == 'css':
            elements = doc.cssselect(selector)
        else:
            elements = doc.xpath(selector)

        # Collect each matched element's .text value
        return [{"value": el.text} for el in elements]
    except Exception as e:
        # Keep the public exception type and message, but chain the original
        # error so the real cause stays visible in the traceback.
        raise ValueError("Unable to parse the html_content provided.") from e