From ec10fc557fcb250778ec2b57677115d18467ad0f Mon Sep 17 00:00:00 2001
From: Juuso Rytilahti <rytilahti.juuso@gmail.com>
Date: Mon, 4 Sep 2023 14:18:42 +0300
Subject: [PATCH] Initial commit

---
 .gitignore                  | 160 ++++++++++++++++++++++++
 LICENSE                     |  21 ++++
 README.md                   |  73 ++++++++++-
 debug/Readme.txt            |   1 +
 debug/chunk0_original.md    |   3 +
 debug/chunk0_translation.md |  97 +++++++++++++++
 input.md                    |   1 +
 output.md                   |   1 +
 translator_script.py        | 241 ++++++++++++++++++++++++++++++++++++
 9 files changed, 597 insertions(+), 1 deletion(-)
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 debug/Readme.txt
 create mode 100644 debug/chunk0_original.md
 create mode 100644 debug/chunk0_translation.md
 create mode 100644 input.md
 create mode 100644 output.md
 create mode 100644 translator_script.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..68bc17f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock

+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..1d22e17
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Juuso Rytilahti
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 4490a7d..2e294ed 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,72 @@
-# gpt-longtexttranslator
+# Table of Contents
+
+- [Translator script](#translator-script)
+  - [Important Notes](#important-notes)
+  - [Initial set-up](#initial-set-up)
+  - [Workflow](#workflow)
+  - [Translator FAQ](#translator-faq)
+    - [1.) How to alter the chunk size? (or how to fix the "exceeding maximum context size" error)](#1-how-to-alter-the-chunk-size-or-how-to-fix-the-exceeding-maximum-context-size-error)
+    - [2.) Why is the chunk division shown in output.md?](#2-why-is-the-chunk-division-shown-in-outputmd)
+    - [3.) Why do the input and output use markdown as the file format?](#3-why-do-the-input-and-output-use-markdown-as-the-file-format)
+
+# Translator script
+
+Besides translating, ChatGPT can also localize the text. For example, it can automatically:
+
+- Localize names, e.g. Ossi -> Oliver.
+- Localize team or company names, e.g. *TPS* -> *Arsenal*.
+- ChatGPT is also really good at translating code snippets (while keeping the code compilable):
+  - `Pelikortti pataseiska = new Pelikortti(Maa.PATA, 7);`
+  - `PlayingCard sevenOfSpades = new PlayingCard(Suit.SPADES, 7);`
+
+## Important Notes:
+
+- Due to the context size limitations of current LLMs, the Python script internally breaks longer input texts into multiple chunks. The text in `output.md` is the combined result of these chunk translations.
+
+- Currently the script passes the 3 previous chunks and their translations as context for each new translation. This helps keep e.g. variable names consistent across the whole text (see the sketch after this list).
+
+- ChatGPT may translate a chunk correctly but still alter the structure of the text: it might add additional headlines or modify the original quite heavily, and it might hallucinate by adding additional examples. Most often these hallucinations occur at the start of a chunk. It also has a tendency to repeat the headline of the previous chunk at the start of a new chunk even though the rest of the chunk has been translated correctly. Because of this, it is important to check the actual **content** of the translation instead of just the headlines!
+
+- The chunk division is also shown in `output.md`. This is because most often, if a chunk translation "fails" (e.g. due to hallucination), the whole chunk fails, and the user (you) can skip directly to the next chunk.
+
+- Depending on the target language, the similarity score might be 1 even for a successful translation. If the console prints that the score is 1, check whether the translation is good. With GPT-3.5 translating Finnish to English this problem did not occur, but with GPT-4 as the base model translating Finnish to Swedish it occurred once. That is why, if the chosen model is GPT-4, no variations are generated regardless of the similarity score of the currently processed chunk.
+
+- For now, modifying `top_p` instead of `temperature` seems to produce better results.
+
+- Always double-check any logic (e.g. mathematical operations) that ChatGPT outputs.
+
+- Currently the script uses the model `gpt-4`. If you decide to use `gpt-3.5-turbo-16k` or `gpt-4-32k`, you can make the chunk size bigger. For instructions on how to do that, check the [FAQ](#1-how-to-alter-the-chunk-size-or-how-to-fix-the-exceeding-maximum-context-size-error) below.
+
+- ChatGPT has a tendency to (sometimes) repeat the translation of the previous message instead of translating the new chunk. The provided Python script detects if this is happening and, if it is, replaces the failed translation with the original text.
+
+- Chunks and their translations can be found in the `debug` folder.
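+
+Below is a minimal sketch (illustrative only; the chunk texts are hypothetical placeholders) of the message list the script sends to the API when one previous chunk is kept as context. The script keeps up to three such `(system, user, assistant)` triplets before appending the system prompt and the new chunk:
+
+```python
+# Illustrative only: message list for translating chunk 1 with chunk 0 as context.
+messages = [
+    # Stored (system, user, assistant) triplet from the previous chunk:
+    {"role": "system", "content": "You are a translator. ..."},
+    {"role": "user", "content": "<original text of chunk 0>"},
+    {"role": "assistant", "content": "<translation of chunk 0>"},
+    # The system prompt and the new chunk are appended last:
+    {"role": "system", "content": "You are a translator. ..."},
+    {"role": "user", "content": "<original text of chunk 1>"},
+]
+```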
+
+## Initial set-up
+1. Change the API key.
+2. Check the initial prompt. Currently it is set up for Finnish-to-English translation. IMPORTANT: write the initial prompt so that it names the target language! (So if the translation is Finnish to Swedish, replace "English" in the initial prompt with "Swedish".)
+3. Install the required libraries.
+
+## Workflow
+1. Paste the text into `input.md`. You don't need to worry about exceeding the maximum context; the script handles that for you. The text can be in plain text or in markdown format. [Here](https://github.com/rytilahti-juuso/ChatMD) you can find good Python and JavaScript templates for extracting the "syntactic sugar" from websites as well. The script requires some alteration if you want to extract e.g. code syntax from a website.
+2. Run `translator_script.py`. (The first run might take a long time.) The console prints some progress information while the process is running. GPT-4 translation can take up to 2 min/chunk; GPT-3.5 is quicker.
+3. Finally, to see the translated text, open `output.md` with a viewer that supports the `markdown` file format.
+
+## Translator FAQ
+
+#### 1.) How to alter the chunk size? (or how to fix the "exceeding maximum context size" error)
+
+The translation chunk size can be altered in two ways (a short sketch follows this list):
+- Change the text chunk size by altering the parameter `chunk_size` in `split_into_chunks()`. This is the preferred way if the error message about **exceeding the maximum token size** (context) pops up.
+- You can also try to modify the number of previous chunks kept in context in `get_messages()`. However, the current number of previous chunks kept in context seems to be okay.
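+
+A minimal sketch of the first option (the value 150 is an arbitrary illustrative number, not a recommendation):
+
+```python
+# In translator_script.py, change the call in the main block from
+#   chunks = split_into_chunks(file_content)
+# to pass a smaller size explicitly, e.g.:
+chunks = split_into_chunks(file_content, chunk_size=150)
+```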
+
+#### 2.) Why is the chunk division shown in output.md?
+
+The chunks are visible in `output.md` because, at least with ChatGPT 3.5, if the translation fails (e.g. ChatGPT starts to hallucinate, or the translation is replaced with the original text), the whole chunk fails. This way the user can skip to the next chunk more easily. **However**, as stated before, ChatGPT has a tendency to hallucinate examples and additional headlines, especially at the start of a chunk, so read each chunk a little further than its headline and first few sentences!
+
+#### 3.) Why do the input and output use markdown as the file format?
+There are a couple of reasons for this.
+- It is a well-known format, and ChatGPT probably knows its syntax quite well (no need to reinvent the wheel).
+
+- Markdown's syntax uses relatively few special characters. This means it can preserve the formatting with very low additional token usage compared to other options, e.g. the HTML format.
+
+- Additionally, there are various online converters that can convert other file formats (e.g. HTML) to markdown.
\ No newline at end of file
diff --git a/debug/Readme.txt b/debug/Readme.txt
new file mode 100644
index 0000000..080cc9a
--- /dev/null
+++ b/debug/Readme.txt
@@ -0,0 +1 @@
+These are all the broken-down chunks and their translations.
\ No newline at end of file
diff --git a/debug/chunk0_original.md b/debug/chunk0_original.md
new file mode 100644
index 0000000..b55d625
--- /dev/null
+++ b/debug/chunk0_original.md
@@ -0,0 +1,3 @@
+- The individual chunks of text will be pasted here. This debug approach has been quite useful.
+
+- Currently, if the script generates multiple variations, only the newest variation is saved in the debug translation chunk.
\ No newline at end of file
diff --git a/debug/chunk0_translation.md b/debug/chunk0_translation.md
new file mode 100644
index 0000000..a818a59
--- /dev/null
+++ b/debug/chunk0_translation.md
@@ -0,0 +1,97 @@
+# Enum classes
+Last week we saw some examples of storing constant values as static variables in a class. In older versions of Java it was in fact common to use static variables in this way:
+```java
+class Färg {
+    public static final int SPADER = 1;
+    public static final int HJÄRTER = 2;
+    public static final int KLÖVER = 3;
+    public static final int RUTER = 4;
+}
+```
+
+Now the playing-card class can look like this:
+```java
+class Spelkort {
+    private int färg;
+    private int nummer;
+
+    public Spelkort(int färg, int nummer) {
+        this.färg = färg;
+        this.nummer = nummer;
+    }
+
+    public int getFärg() {
+        return färg;
+    }
+
+    public int getNummer() {
+        return nummer;
+    }
+}
+```
+...and a new playing card can be created like this:
+```java
+public static void main(String[] args) {
+    Spelkort spader_sju = new Spelkort(Färg.SPADER, 7);
+}
+```
+
+This approach has a clear flaw, however: the constant numeric values are not automatically associated with their meanings. Even if we have agreed that the numeric value 1 corresponds to spades in a deck of cards, asking for the suit still returns only an integer:
+```java
+public static void main(String[] args) {
+    Spelkort spader_sju = new Spelkort(Färg.SPADER, 7);
+    System.out.println("Kortets färg: " + spader_sju.getFärg());
+    System.out.println("Kortets nummer: " + spader_sju.getNummer());
+}
+```
+The program prints:
+```java
+Kortets färg: 1
+Kortets nummer: 7
+```
+
+A better solution is to use the enum class introduced in Java version five.
+## What is an enum class?
+The purpose of an enum class is to define a set of values. Typical examples would be the suits of a deck of cards, the points of the compass, or the houses of Hogwarts.
+The class is defined with the keyword enum. In its simplest form it is enough to write the values belonging to the set inside the class definition, separated by commas:
+```java
+enum Färg {
+    SPADER, HJÄRTER, KLÖVER, RUTER
+}
+```
+
+Now we can change the definition of the playing-card class so that the suit is of type Färg instead of an integer. This also restricts the possible values: the card's suit cannot be anything other than one of the four defined in the enum class Färg.
+```java
+class Spelkort {
+    private Färg färg;
+    private int nummer;
+
+    public Spelkort(Färg färg, int nummer) {
+        this.färg = färg;
+        this.nummer = nummer;
+    }
+
+    public Färg getFärg() {
+        return färg;
+    }
+
+    public int getNummer() {
+        return nummer;
+    }
+}
+```
+
+This now shows both when the card is created and when the card's suit is printed:
+```java
+public static void main(String[] args) {
+    Spelkort spader_sju = new Spelkort(Färg.SPADER, 7);
+    System.out.println("Kortets färg: " + spader_sju.getFärg());
+    System.out.println("Kortets nummer: " + spader_sju.getNummer());
+}
+```
+Now the program prints:
+```java
+Kortets färg: SPADER
+Kortets nummer: 7
+```
+## Using enum values
\ No newline at end of file
diff --git a/input.md b/input.md
new file mode 100644
index 0000000..d2be789
--- /dev/null
+++ b/input.md
@@ -0,0 +1 @@
+Paste your input here!
\ No newline at end of file
diff --git a/output.md b/output.md
new file mode 100644
index 0000000..8672a65
--- /dev/null
+++ b/output.md
@@ -0,0 +1 @@
+After the script has run, the final output will appear here!
\ No newline at end of file
diff --git a/translator_script.py b/translator_script.py
new file mode 100644
index 0000000..98c03cc
--- /dev/null
+++ b/translator_script.py
@@ -0,0 +1,241 @@
+# General imports
+import os
+import openai
+import copy
+# Similarity imports
+from transformers import BertTokenizer, BertModel
+import torch
+from sklearn.metrics.pairwise import cosine_similarity
+
+# Set the environment variable
+os.environ['API_KEY'] = 'PASTE_YOUR_API_KEY_HERE'
+
+GPT_MODEL = "gpt-4"
+"""
+If the GPT model is gpt-4, the similarity score of a successfully
+translated text chunk may be 1! This behaviour did not occur
+while GPT-3.5 was used.
+"""
+
+# Possible roles for individual messages:
+# system
+# user
+# assistant
+
+
+def get_translation_for_chunk(chunk, i, temperature=1, previous_messages=None):
+    """
+    Returns the most recent ChatGPT answer (the translated string),
+    or None if the API returned no choices.
+    """
+    openai.api_key = os.getenv('API_KEY')
+
+    all_messages = []
+    # Add the previous messages so ChatGPT can use them as examples.
+    if previous_messages:
+        all_messages = all_messages + previous_messages
+
+    all_messages.append({
+        "role": "system",
+        "content": INITIAL_PROMPT
+    })
+
+    # Add the new message to the end
+    all_messages.append({
+        "role": "user",
+        "content": chunk
+    })
+
+    print("API HAS BEEN CALLED!")
+    # Call the API
+    chat_completion = openai.ChatCompletion.create(
+        model=GPT_MODEL,
+        messages=all_messages,
+        temperature=temperature,
+        top_p=0.8
+    )
+
+    if 'choices' in chat_completion and chat_completion['choices']:
+        return chat_completion['choices'][0]['message']['content']
+    else:
+        return None
+
+def create_messages(prompt, serverAnswer):
+    return [
+        {"role": "system", "content": INITIAL_PROMPT},
+        {"role": "user", "content": prompt},
+        {"role": "assistant", "content": serverAnswer}
+    ]
+
+def count_words(input_string):
+    return len(input_string.split())
+
+def split_into_chunks(input_string, chunk_size=290):
+    """
+    Args:
+        input_string: The whole input string; should be in md format.
+        chunk_size: Maximum size of a chunk in word count.
+            NOTE: If the text is in the middle of a code block at the point
+            where the chunk would be broken, the whole block is kept in the
+            chunk (so the chunk's word count may exceed the set limit by a
+            little bit).
+    """
+    # Split the input string by newline to get lines
+    lines = input_string.split('\n')
+    chunks = []
+    current_chunk = []
+    word_count = 0
+    inside_code_block = False
+    prev_line = ''
+    for line in lines:
+        line_word_count = count_words(line)
+        # Check for a code block start or end
+        if line.startswith("```java"):
+            inside_code_block = True
+        elif "```" in prev_line and "```java" not in prev_line and inside_code_block:
+            inside_code_block = False
+
+        # If adding this line would exceed the chunk size and we are not
+        # inside a code block, close the current chunk and start a new one.
+        if word_count + line_word_count > chunk_size and not inside_code_block:
+            chunks.append("\n".join(current_chunk))
+            current_chunk = [line]
+            word_count = line_word_count
+        else:
+            current_chunk.append(line)
+            word_count += line_word_count
+        prev_line = line
+
+    # Add the last chunk if it's not empty
+    if current_chunk:
+        chunks.append("\n".join(current_chunk))
+
+    return chunks
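+
+# Illustrative usage (hypothetical numbers): with the default chunk_size of
+# 290 words, a ~600-word markdown input yields two or three chunks, and a
+# fenced ```java block is never split across two chunks:
+#   chunks = split_into_chunks(read_from_file("input.md"))
+#   print(len(chunks))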
+
+def write_to_file(file_path, content):
+    try:
+        with open(file_path, 'w') as file:
+            file.write(content)
+        print("Successfully wrote to the file: " + file_path)
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+def read_from_file(file_path):
+    try:
+        with open(file_path, "r") as file:
+            return file.read()
+    except FileNotFoundError:
+        print(f"File not found at: {file_path}")
+        return None
+    except IOError:
+        print(f"Error reading the file: {file_path}")
+        return None
+
+def get_recursive_translation(chunk, i, previous_messages, attempt=1, max_attempts=8):
+    """
+    ChatGPT has a tendency to (sometimes) repeat the previous translation
+    in the new message instead of translating the new chunk. If that
+    happens, try to translate the chunk again.
+    """
+    trans = get_translation_for_chunk(chunk, i, previous_messages=copy.deepcopy(previous_messages))
+
+    if not trans:
+        return None
+
+    write_to_file("./debug/chunk" + str(i) + "_original.md", chunk)
+    write_to_file("./debug/chunk" + str(i) + "_translation.md", trans)
+
+    if not previous_messages:
+        return create_messages(chunk, trans)
+
+    prev_translation = previous_messages[-1]["content"]
+    if are_texts_similar(prev_translation, trans):
+        if attempt <= max_attempts:
+            # If the new translation repeats the previous one, recursively
+            # try again, hoping for a varied output.
+            print("Trying a different translation...")
+            return get_recursive_translation(chunk, i, previous_messages, attempt + 1, max_attempts)
+        # If it is still a repeat after the maximum number of attempts,
+        # replace the failed translation with the original text.
+        print("The translation is still the same after multiple attempts, replacing the failed translation text with the text from the original!")
+        return create_messages(chunk, chunk)
+
+    return create_messages(chunk, trans)
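+
+# Context arithmetic for get_messages() below (illustrative): each translated
+# chunk is stored as 3 messages (system, user, assistant), so trimming
+# previous_messages to at most 9 entries keeps exactly the 3 most recent
+# chunks as context. E.g. while chunk 5 is being processed, the context holds
+# the message triplets of chunks 2, 3 and 4.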
+
+def get_messages(chunk, previous_messages, i):
+    max_attempts = 1 if GPT_MODEL == "gpt-4" else 8
+    new_messages = get_recursive_translation(chunk, i, previous_messages, max_attempts=max_attempts)
+
+    # Drop the oldest chunk's messages if needed, to avoid overflowing the
+    # context (see the arithmetic sketched above).
+    if previous_messages and len(previous_messages) > 9:
+        previous_messages = previous_messages[3:]
+
+    if previous_messages:
+        return previous_messages + new_messages
+    else:
+        return new_messages
+
+def get_text_embedding(text):
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # Use the average of the last hidden state as the sentence representation
+    return outputs.last_hidden_state.mean(dim=1).numpy()
+
+def are_texts_similar(text1, text2, threshold=0.987):
+    """
+    ChatGPT 3.5 has a tendency to repeat the translation of the previous
+    message instead of translating the new chunk; this check detects that.
+    """
+    vec1 = get_text_embedding(text1)
+    vec2 = get_text_embedding(text2)
+
+    similarity = cosine_similarity(vec1, vec2)[0][0]
+    print("similarity is: " + similarity.astype(str))
+    return similarity > threshold
+
+# ------------ SET-UP ------------
+INITIAL_PROMPT = "You are a translator. Localize and translate the study materials to English. Keep the meaning of the exercise in translation, but it does not need to be a literal translation. If there are Finnish names change them to names used in England. Keep every actor the same."
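+
+# NOTE (illustrative): for a different language pair, change the target
+# language named in INITIAL_PROMPT above, e.g. for Finnish-to-Swedish
+# replace "to English" with "to Swedish" and "names used in England" with
+# "names used in Sweden".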
+# Load the BERT tokenizer and model
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+model = BertModel.from_pretrained('bert-base-uncased')
+
+#file_path = "./small_examples/small_example.md"
+file_path = "input.md"
+file_content = read_from_file(file_path)
+# ---------------------------------
+
+
+if file_content:
+    chunks = split_into_chunks(file_content)
+    final_text = ""
+    previous_messages = None
+    print("input.md has been broken down into " + str(len(chunks)) + " chunks.")
+    for i, chunk in enumerate(chunks):
+        print(" ")
+        print(" ")
+        print("Currently processing chunk " + str(i) + "/" + str(len(chunks) - 1))
+        messages = get_messages(chunk, previous_messages, i=i)
+
+        # Include the previous messages in the context.
+        if previous_messages is None:
+            previous_messages = messages
+        else:
+            # TODO: take more messages into the context if the word count is
+            # small enough. Currently only the latest messages are kept.
+            previous_messages = messages
+        # Latest element, value of the content property
+        trans = messages[-1]["content"]
+
+        # Divider between chunks, to add readability. (Normally if the
+        # translation fails, the translation of the whole chunk fails.)
+        chunk_divider = "\n\n---\n# Chunk " + str(i) + "\n---\n\n"
+
+        final_text = final_text + chunk_divider + trans
+        print(" ")
+        print(" ")
+    write_to_file("output.md", final_text)
--
GitLab