diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..1d22e17b15d1b359158576714bd12b61f459d02a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Juuso Rytilahti + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 1b63aca0605b43133e4d530320b663d83dbeb5b9..cb06ce6bcd9d9a0c6322a08f8175dce7ab6e639a 100644 --- a/README.md +++ b/README.md @@ -1 +1,16 @@ -# utuChatmd +# ChatMD +Convert ChatGPT chats to Markdown (.md) while preserving most of the syntax (e.g. code formatting). Implemented in JavaScript and Python. Most of the Python script was produced by using ChatGPT to convert the JavaScript version to Python. + +## JavaScript +1. Navigate to the shared chat URL. +2. Copy the JS code from `extract_md_from_ChatGPT.js` into a new snippet in Chrome DevTools (Sources → Snippets), or paste it directly into the console. +3. Run the snippet (or the code pasted into the console). +4. 
Copy the output of the console. +5. Paste it where you desire! + +## Python +1. Install required libraries (`beautifulsoup4` and `requests`). +2. Set the `url` variable in `extract_md_from_ChatGPT.py` (`url = "YOUR_URL_HERE"`) to the shared chat URL. +3. Run the script. +4. Review the content on the console or in the `output.md` file. + diff --git a/extract_md_from_ChatGPT.js b/extract_md_from_ChatGPT.js new file mode 100644 index 0000000000000000000000000000000000000000..d7b78a20604aa5278a29385040dcc00c4ca5f417 --- /dev/null +++ b/extract_md_from_ChatGPT.js @@ -0,0 +1,44 @@ +// Run the below code on the console. Copies the text content on the opened +// message thread and converts it to md format (e.g. keeps code and headers in correct format). +{ + let mainContent = document.getElementsByTagName('body')[0] + if (mainContent) { + let all_text = ''; + + // Recursive function to traverse child nodes + function traverseNodes(node) { + if (node.nodeType === Node.ELEMENT_NODE) { + let tag = node.tagName.toLowerCase(); + + // Check if the tag is a header tag and prefix it with the appropriate number of `#` + if (['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tag)) { + let headerLevel = parseInt(tag.charAt(1)); // Extract header level from the tag name + let prefix = '#'.repeat(headerLevel); // Create prefix with the appropriate number of `#` + all_text += prefix + ' ' + node.textContent.trim() + '\n'; + } + // If is code block + else if(['pre'].includes(tag)){ + // Replace the "Copy code"- text with empty string. Because before that is the code language name, it can be used directly as Displaying correct syntax with markdown. 
# Function to write the scraped content to a file
def write_to_file(file_path, content):
    """Write ``content`` to ``file_path``, replacing any existing file.

    The file is always written as UTF-8 so that non-ASCII chat text
    (accents, emoji, CJK) does not depend on the platform's default
    encoding.

    Args:
        file_path: Destination path of the Markdown file.
        content: Text to write.

    Returns:
        True if the file was written, False otherwise. Failures are
        reported on stdout instead of being raised, so the caller can
        still print the converted text to the console.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)
    except (OSError, UnicodeError, TypeError) as e:
        # OSError: bad path / permissions; UnicodeError: unencodable text
        # (e.g. lone surrogates); TypeError: content is not a string.
        print(f"An error occurred: {e}")
        return False
    print("Successfully wrote to the file.")
    return True


# Function to fetch the HTML content of a URL
def fetch_url(url, *, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The decoded response body, or None if the request failed or the
        server answered with an error status (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions handled below.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"An error occurred while fetching the URL: {e}")
        return None
    return response.text


_HEADER_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')


def _node_markdown(node, tag):
    """Return the Markdown emitted for one element (children excluded)."""
    # Keep the headers ChatGPT presented to the user.
    if tag in _HEADER_TAGS:
        return f"{'#' * int(tag[1])} {node.get_text(strip=False)}\n"

    # Code block: the text before the "Copy code" button label is the
    # language name, so swapping the label for a newline yields a fenced
    # block with the right language tag. Only the first occurrence is the
    # button; later ones belong to the code itself and must be kept.
    if tag == 'pre':
        code = node.get_text(strip=False).replace('Copy code', '\n', 1)
        return f"```{code}\n```\n"

    # Paragraphs are copied as-is.
    if tag == 'p':
        return f"{node.get_text(strip=False)}\n\n"

    # The user's prompt is rendered in a div carrying class "empty:hidden".
    if tag == 'div' and 'empty:hidden' in (node.get('class') or []):
        return f"\n___\n# User\n{node.get_text(strip=False)}\n___\n# ChatGPT\n"

    return ''


# Function to traverse the HTML nodes and convert them to Markdown text
def traverse_nodes(node, all_text):
    """Append the Markdown rendering of ``node``'s subtree to ``all_text``.

    Elements are visited in document order (parent first, then children
    left to right). Nodes without a tag name, such as text nodes, are
    skipped together with anything below them.

    NOTE: descendants of an element that already produced output are
    still visited, so a <p> nested inside a user-prompt <div> is emitted
    a second time.

    Args:
        node: Root of the subtree; must expose ``name``, ``children``,
            ``get_text()`` and ``get()`` like a BeautifulSoup tag.
        all_text: Markdown accumulated so far.

    Returns:
        ``all_text`` followed by the Markdown generated for the subtree.
    """
    # An explicit stack avoids RecursionError on deeply nested pages, and
    # joining once at the end avoids repeated string concatenation.
    parts = [all_text]
    pending = [node]
    while pending:
        current = pending.pop()
        if not current.name:
            continue
        parts.append(_node_markdown(current, current.name.lower()))
        # Reversed so the left-most child is popped (and emitted) first.
        pending.extend(reversed(list(current.children)))
    return ''.join(parts)


# URL of the shared ChatGPT conversation to convert; replace before running.
url = "YOUR_URL_HERE"


def main():
    """Fetch ``url``, convert the page to Markdown, save and print it."""
    html_string = fetch_url(url)
    if not html_string:
        print("Failed to fetch the URL.")
        return

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_string, 'html.parser')
    # The conversation is assumed to live somewhere inside <body>.
    main_content = soup.find('body')
    if main_content is None:
        print("No elements were found. ChatGPT UI has probably been updated. Please update the code to parse through correct elements.")
        return

    all_text = traverse_nodes(main_content, '')
    write_to_file("./output.md", all_text)
    print(" ")
    print(" Below is the MD output printed to console! ")
    print(" ")
    print(all_text)


if __name__ == "__main__":
    main()