Skip to content
Snippets Groups Projects
Commit 87529cb3 authored by Juuso Rytilahti's avatar Juuso Rytilahti
Browse files

Multistep_ugly_implementation

parent ddfc0be0
No related branches found
No related tags found
No related merge requests found
# General imports
import os
import re
import openai
import copy
# Similarity imports
......@@ -61,6 +62,50 @@ def get_translation_for_chunk(chunk,i, temperature=1, previous_messages=None):
return chat_completion['choices'][0]['message']['content']
else:
return None
# return most recent chatGPT answer.
def get_improved_translation_for_chunk(chunk, translation, temperature=1, previous_messages=None):
    """
    Ask the model to review an existing translation and produce an improved one.

    Args:
        chunk: Original (source-language) text.
        translation: The already-translated version of ``chunk``.
        temperature: Sampling temperature forwarded to the chat completion API.
        previous_messages: Accepted for interface compatibility but currently
            unused — few-shot priming with earlier messages is disabled.

    Returns:
        The model's full answer (its rationale plus the improved translation,
        which the prompt asks to be wrapped in <FINAL_TRANSLATION> tags), or
        ``None`` when the API response contains no choices.
    """
    openai.api_key = os.getenv('API_KEY')
    # TODO: make the language pair configurable instead of hard-coding
    # English -> Finnish here (and in MULTI_STEP_TRANSLATION_PROMPT).
    message = "English: \n" + chunk + "\n\n" + "Finnish:\n" + translation
    all_messages = [
        # System prompt instructs the model to act as a translation reviewer.
        {"role": "system", "content": MULTI_STEP_TRANSLATION_PROMPT},
        # The user message carries the source text and the current translation.
        {"role": "user", "content": message},
    ]
    print("API HAS BEEN CALLED!")
    chat_completion = openai.ChatCompletion.create(
        model=GPT_MODEL,
        messages=all_messages,
        temperature=temperature,
        # TODO: test with top_p=0.8
        top_p=1,  # default top_p so repeated calls yield slightly different versions
    )
    if 'choices' in chat_completion and chat_completion['choices']:
        return chat_completion['choices'][0]['message']['content']
    return None
def create_messages(prompt, serverAnswer):
return [
......@@ -199,13 +244,36 @@ def are_texts_similar(text1, text2, threshold=0.987):
print("similarity is: " + similarity.astype(str))
return similarity > threshold
# NOTE(review): this English-target prompt is re-assigned further down in the
# file with a Finnish-target version, so this value is effectively dead —
# confirm which target language is intended and keep only one definition.
LATEX_GENERAL_TRANSLATION_PROMPT = "You are a translator. Translate material in the latex file to English. Don't translate the comments. Do not alter the latex syntax, even if you deem it, for example, to miss some elements."
def extract_final_translation(text):
    """
    Pull out the text wrapped in a <FINAL_TRANSLATION>...</FINAL_TRANSLATION> pair.

    Args:
        text (str): String that may contain a <FINAL_TRANSLATION> block.

    Returns:
        str: Contents of the first tag pair found, or '' when there is none.
    """
    # DOTALL lets '.' span newlines so multi-line translations are captured;
    # the non-greedy group stops at the first closing tag.
    found = re.search(r'<FINAL_TRANSLATION>(.*?)</FINAL_TRANSLATION>', text, re.DOTALL)
    if found:
        return found.group(1)
    return ''
# LaTeX source -> Finnish, preserving LaTeX syntax and comments.
# NOTE(review): this shadows an earlier English-target definition of the same
# constant — confirm which target language is intended and remove the other.
LATEX_GENERAL_TRANSLATION_PROMPT = "You are a translator. Translate material in the latex file to Finnish. Don't translate the comments. Do not alter the latex syntax, even if you deem it, for example, to miss some elements."
# Plain text / Markdown -> English, literal translation.
GENERAL_TRANSLATION_PROMPT_PLAIN_TEXT_AND_MD = "You are a translator. Translate the material to English." # Not thoroughly tested, but should work for basic usage.
# Plain text / Markdown -> English with localization (e.g. Finnish names
# swapped for English ones) rather than a strictly literal translation.
TRANSLATE_AND_LOCALIZE_STUDY_MATERIAL_PROMPT_PLAIN_TEXT_OR_MD = "You are a translator. Localize and translate the study materials to English. Keep the meaning of the exercise in translation, but it does not need to be literal translation. If there are Finnish names change them to names used in England. Keep every actor the same."
# Review-pass prompt: the model critiques an existing translation and wraps
# its final version in <FINAL_TRANSLATION> tags (parsed by
# extract_final_translation). Update the language (and the role) in the
# prompt to suit your needs.
MULTI_STEP_TRANSLATION_PROMPT = "You are a translator and a software developer. Your native language is Finnish. Below you find text that is translated from English to Finnish. Review the translation. Focus on the fluency and accuracy of the text. First, explain your rationale. Then provide a final version in which you have added the improvements. Put the final version inside a custom tag <FINAL_TRANSLATION>."
# ------------ SET-UP ------------
# Select the prompt used for the first translation pass.
# (A dead `INITIAL_PROMPT = ""` assignment was removed here: it was
# immediately overwritten by the line below and never read.)
INITIAL_PROMPT = LATEX_GENERAL_TRANSLATION_PROMPT

# Load BERT tokenizer and model, presumably used by the similarity check
# (are_texts_similar) — downloads weights on first run.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
......@@ -231,6 +299,7 @@ if file_content:
final_text = ""
previous_messages = None
print("input.md has been broken down to "+str(len(chunks)) + " chunks.")
if(False):
for i, chunk in enumerate(chunks):
print(" ")
print(" ")
......@@ -256,6 +325,21 @@ if file_content:
# In case the translation fails to an error when only part of the translation is done
# write the currently translated text also to the output.md
write_to_file("output.md", final_text)
# Create the better versions
if(False):
for i, chunk in enumerate(chunks):
#TODO add error management The translations should be gained through better way
translation = read_from_file('./debug/chunk'+str(i)+'_translation.md')
improved_trans = get_improved_translation_for_chunk(chunk, translation)
write_to_file("./debug/chunk" + str(i) + "_improved_translation.md", improved_trans)
final_text = final_text + improved_trans
#Parse the final_text
for i, chunk in enumerate(chunks):
final_translation = read_from_file('./debug/chunk'+str(i)+'_improved_translation.md')
final_translation = extract_final_translation(final_translation)
write_to_file("./debug/chunk" + str(i) + "_final_translation.md", final_translation)
final_text = final_text+final_translation
print(" ")
print(" ")
write_to_file("output.md", final_text)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment