diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ea6e1c6..ca687c39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.57.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.56.0...v1.57.0) (2025-06-13) + + +### Features + +* add markdownify endpoint ([7340375](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/73403755da1e4c3065e91d834c59f6d8c1825763)) + ## [1.56.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.55.0...v1.56.0) (2025-06-13) diff --git a/examples/markdownify/.env.example b/examples/markdownify/.env.example new file mode 100644 index 00000000..8f19deb6 --- /dev/null +++ b/examples/markdownify/.env.example @@ -0,0 +1 @@ +SCRAPEGRAPH_API_KEY=your SCRAPEGRAPH_API_KEY \ No newline at end of file diff --git a/examples/markdownify/markdownify_scrapegraphai.py b/examples/markdownify/markdownify_scrapegraphai.py new file mode 100644 index 00000000..113fe5cd --- /dev/null +++ b/examples/markdownify/markdownify_scrapegraphai.py @@ -0,0 +1,35 @@ +""" +Example script demonstrating the markdownify functionality +""" + +import os +from dotenv import load_dotenv +from scrapegraph_py import Client +from scrapegraph_py.logger import sgai_logger + +def main(): + # Load environment variables + load_dotenv() + + # Set up logging + sgai_logger.set_logging(level="INFO") + + # Initialize the client + api_key = os.getenv("SCRAPEGRAPH_API_KEY") + if not api_key: + raise ValueError("SCRAPEGRAPH_API_KEY environment variable not found") + sgai_client = Client(api_key=api_key) + + # Example 1: Convert a website to Markdown + print("Example 1: Converting website to Markdown") + print("-" * 50) + response = sgai_client.markdownify( + website_url="https://example.com" + ) + print("Markdown output:") + print(response["result"]) # Access the result key from the dictionary + print("\nMetadata:") + print(response.get("metadata", {})) # Use get() with default value + print("\n" + "=" * 50 + "\n") +if __name__ == "__main__": + main() diff --git 
a/examples/markdownify/readme.md b/examples/markdownify/readme.md new file mode 100644 index 00000000..46c506df --- /dev/null +++ b/examples/markdownify/readme.md @@ -0,0 +1,75 @@ +# Markdownify Graph Example + +This example demonstrates how to use the Markdownify graph to convert HTML content to Markdown format. + +## Features + +- Convert HTML content to clean, readable Markdown +- Support for both URL and direct HTML input +- Maintains formatting and structure of the original content +- Handles complex HTML elements and nested structures + +## Usage + +```python +from scrapegraph_py import Client +from scrapegraph_py.logger import sgai_logger + +# Set up logging +sgai_logger.set_logging(level="INFO") + +# Initialize the client +sgai_client = Client(api_key="your-api-key") + +# Example 1: Convert a website to Markdown +response = sgai_client.markdownify( + website_url="https://example.com" +) +print(response.markdown) + +# Example 2: Convert HTML content directly +html_content = """ +
<div>
+    <h1>Hello World</h1>
+    <p>This is a test paragraph.</p>
+</div>
+""" +response = sgai_client.markdownify( + html_content=html_content +) +print(response.markdown) +``` + +## Parameters + +The `markdownify` method accepts the following parameters: + +- `website_url` (str, optional): The URL of the website to convert to Markdown +- `html_content` (str, optional): Direct HTML content to convert to Markdown + +Note: You must provide either `website_url` or `html_content`, but not both. + +## Response + +The response object contains: + +- `markdown` (str): The converted Markdown content +- `metadata` (dict): Additional information about the conversion process + +## Error Handling + +The graph handles various edge cases: + +- Invalid URLs +- Malformed HTML +- Network errors +- Timeout issues + +If an error occurs, it will be logged and raised with appropriate error messages. + +## Best Practices + +1. Always provide a valid URL or well-formed HTML content +2. Use appropriate logging levels for debugging +3. Handle the response appropriately in your application +4. Consider rate limiting for large-scale conversions diff --git a/pyproject.toml b/pyproject.toml index a1fe9b13..2a16ac06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.56.0" +version = "1.57.0" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ diff --git a/scrapegraphai/graphs/markdownify_graph.py b/scrapegraphai/graphs/markdownify_graph.py new file mode 100644 index 00000000..78d33b12 --- /dev/null +++ b/scrapegraphai/graphs/markdownify_graph.py @@ -0,0 +1,83 @@ +""" +markdownify_graph module +""" + +from typing import Dict, List, Optional, Tuple + +from ..nodes import ( + FetchNode, + MarkdownifyNode, +) +from .base_graph import BaseGraph + + +class MarkdownifyGraph(BaseGraph): + """ + A graph that converts HTML content to Markdown format. 
+ + This graph takes a URL or HTML content as input and converts it to clean, readable Markdown. + It uses a two-step process: + 1. Fetch the content (if URL is provided) + 2. Convert the content to Markdown format + + Args: + llm_model: The language model to use for processing + embedder_model: The embedding model to use (optional) + node_config: Additional configuration for the nodes (optional) + + Example: + >>> graph = MarkdownifyGraph( + ... llm_model=your_llm_model, + ... embedder_model=your_embedder_model + ... ) + >>> result, _ = graph.execute({"url": "https://example.com"}) + >>> print(result["markdown"]) + """ + + def __init__( + self, + llm_model, + embedder_model=None, + node_config: Optional[Dict] = None, + ): + # Initialize nodes + fetch_node = FetchNode( + input="url | html", + output=["html_content"], + node_config=node_config, + ) + + markdownify_node = MarkdownifyNode( + input="html_content", + output=["markdown"], + node_config=node_config, + ) + + # Define graph structure + nodes = [fetch_node, markdownify_node] + edges = [(fetch_node, markdownify_node)] + + super().__init__( + nodes=nodes, + edges=edges, + entry_point=fetch_node, + graph_name="Markdownify", + ) + + def execute( + self, initial_state: Dict + ) -> Tuple[Dict, List[Dict]]: + """ + Execute the markdownify graph. 
+ + Args: + initial_state: A dictionary containing either: + - "url": The URL to fetch and convert to markdown + - "html": The HTML content to convert to markdown + + Returns: + Tuple containing: + - Dictionary with the markdown result in the "markdown" key + - List of execution logs + """ + return super().execute(initial_state) \ No newline at end of file diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index 460e3f40..b6917238 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -20,6 +20,7 @@ from .graph_iterator_node import GraphIteratorNode from .html_analyzer_node import HtmlAnalyzerNode from .image_to_text_node import ImageToTextNode +from .markdownify_node import MarkdownifyNode from .merge_answers_node import MergeAnswersNode from .merge_generated_scripts_node import MergeGeneratedScriptsNode from .parse_node import ParseNode @@ -45,6 +46,7 @@ "ParseNode", "ParseNodeDepthK", "RobotsNode", + "MarkdownifyNode", # Analysis nodes "HtmlAnalyzerNode", "GetProbableTagsNode", diff --git a/scrapegraphai/nodes/markdownify_node.py b/scrapegraphai/nodes/markdownify_node.py new file mode 100644 index 00000000..2119908a --- /dev/null +++ b/scrapegraphai/nodes/markdownify_node.py @@ -0,0 +1,67 @@ +""" +MarkdownifyNode Module +""" + +from typing import List, Optional + +from ..utils.convert_to_md import convert_to_md +from .base_node import BaseNode + + +class MarkdownifyNode(BaseNode): + """ + A node responsible for converting HTML content to Markdown format. + + This node takes HTML content from the state and converts it to clean, readable Markdown. + It uses the convert_to_md utility function to perform the conversion. + + Attributes: + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. 
+ node_config (Optional[dict]): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Markdownify". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "Markdownify", + ): + super().__init__(node_name, "node", input, output, 1, node_config) + + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + + def execute(self, state: dict) -> dict: + """ + Executes the node's logic to convert HTML content to Markdown. + + Args: + state (dict): The current state of the graph. The input keys will be used to fetch the + HTML content from the state. + + Returns: + dict: The updated state with the output key containing the Markdown content. + + Raises: + KeyError: If the input keys are not found in the state, indicating that the + necessary HTML content is missing. + """ + self.logger.info(f"--- Executing {self.node_name} Node ---") + + input_keys = self.get_input_keys(state) + html_content = state[input_keys[0]] + + # Convert HTML to Markdown + markdown_content = convert_to_md(html_content) + + # Update state with markdown content + state.update({self.output[0]: markdown_content}) + + return state \ No newline at end of file diff --git a/uv.lock b/uv.lock index de98ebeb..2ab28277 100644 --- a/uv.lock +++ b/uv.lock @@ -3462,7 +3462,7 @@ wheels = [ [[package]] name = "scrapegraphai" -version = "1.54.1" +version = "1.55.0" source = { editable = "." } dependencies = [ { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },