davideuler
		
	committed on
		
		
					Commit 
							
							·
						
						91e1678
	
1
								Parent(s):
							
							016ba29
								
translator cli
Browse files- translator_cli.py +102 -0
    	
        translator_cli.py
    ADDED
    
    | @@ -0,0 +1,102 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import argparse
         | 
| 2 | 
            +
            import pymupdf
         | 
| 3 | 
            +
            from deep_translator import (
         | 
| 4 | 
            +
                GoogleTranslator,
         | 
| 5 | 
            +
                ChatGptTranslator,
         | 
| 6 | 
            +
            )
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            # Map of supported translators
         | 
| 9 | 
            +
            TRANSLATORS = {
         | 
| 10 | 
            +
                'google': GoogleTranslator,
         | 
| 11 | 
            +
                'chatgpt': ChatGptTranslator,
         | 
| 12 | 
            +
            }
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            def translate_pdf(
                input_file: str,
                source_lang: str,
                target_lang: str,
                layer: str = "Korean",
                translator_name: str = "google",
            ) -> None:
                """
                Translate a PDF file from source language to target language.

                For every text block on every page, the block's text is translated,
                the original block area is covered with a white rectangle, and the
                translation is written into that same rectangle. Both the cover and
                the translation are attached to one optional content group (OCG)
                layer. The result is saved next to the input file as
                ``<input path without extension>-<target_lang>.pdf`` and the output
                path is printed.

                Args:
                    input_file: Path to input PDF file
                    source_lang: Source language code (e.g. 'en', 'fr')
                    target_lang: Target language code (e.g. 'ko', 'ja')
                    layer: Name of the OCG layer (default: "Korean")
                    translator_name: Name of the translator to use, a key of
                        TRANSLATORS (default: "google")

                Raises:
                    ValueError: If translator_name is not a key of TRANSLATORS.
                """
                # Function-scope imports keep this block self-contained.
                import html
                import os

                # Validate the translator name before touching any file.
                if translator_name not in TRANSLATORS:
                    raise ValueError(f"Unsupported translator: {translator_name}. Available translators: {', '.join(TRANSLATORS.keys())}")

                # Configure the translator
                translator = TRANSLATORS[translator_name](source=source_lang, target=target_lang)

                # Define color "white"
                white = pymupdf.pdfcolor["white"]

                # This flag ensures that text will be dehyphenated after extraction.
                textflags = pymupdf.TEXT_DEHYPHENATE

                # Generate output filename. splitext only strips an extension from the
                # last path component, so dots in directory names (or a leading "./"
                # on an extension-less file) no longer truncate the path.
                output_file = os.path.splitext(input_file)[0] + f"-{target_lang}.pdf"

                # Open the document; the context manager closes it even if the
                # translation or saving step raises.
                with pymupdf.open(input_file) as doc:
                    # Define an Optional Content layer in the document.
                    # Activate it by default.
                    ocg_xref = doc.add_ocg(layer, on=True)

                    # Iterate over all pages
                    for page in doc:
                        # Extract text grouped like lines in a paragraph.
                        # Every block of text is contained in a rectangle ("bbox")
                        for block in page.get_text("blocks", flags=textflags):
                            bbox = block[:4]  # area containing the text
                            text = block[4]  # the text of this block

                            # Nothing to translate: leave the block untouched.
                            if not text.strip():
                                continue

                            # Invoke the actual translation
                            translated = translator.translate(text)

                            # Guard against an empty / None result: keep the original
                            # text visible rather than covering it with a blank box.
                            if not translated:
                                continue

                            # Cover the source text with a white rectangle.
                            page.draw_rect(bbox, color=None, fill=white, oc=ocg_xref)

                            # Write the translated text into the original rectangle.
                            # insert_htmlbox treats its text as HTML, so escape the
                            # plain-text translation to keep '<', '>' and '&' literal.
                            page.insert_htmlbox(
                                bbox,
                                html.escape(translated, quote=False),
                                css="* {font-family: sans-serif;}",
                                oc=ocg_xref,
                            )

                    doc.subset_fonts()
                    doc.ez_save(output_file)

                print(f"Translated PDF saved as: {output_file}")
         | 
| 74 | 
            +
             | 
| 75 | 
            +
            def main():
                """
                Command-line entry point: parse the arguments and translate one PDF.

                Can be invoked like this:
                  python translator_cli.py --source en --target zh-CN path/to/document.pdf

                If translation raises, the error is printed to stderr and the
                process exits with status 1.
                """
                # Function-scope import keeps this block self-contained.
                import sys

                parser = argparse.ArgumentParser(description='Translate PDF documents.')
                parser.add_argument('input_file', help='Input PDF file path')
                parser.add_argument('--source', '-s', default='en',
                                    help='Source language code (default: en)')
                parser.add_argument('--target', '-t', default='ko',
                                    help='Target language code (default: ko)')
                parser.add_argument('--layer', '-l', default='Korean',
                                    help='Name of the OCG layer (default: Korean)')
                parser.add_argument('--translator', '-tr', default='google',
                                    choices=list(TRANSLATORS.keys()),
                                    help='Translator to use (default: google)')

                args = parser.parse_args()

                try:
                    translate_pdf(args.input_file, args.source, args.target, args.layer, args.translator)
                except Exception as e:
                    # Top-level boundary: report on stderr so stdout stays clean, and
                    # use sys.exit instead of the site-provided exit() helper, which
                    # is not guaranteed to exist (e.g. under `python -S`).
                    print(f"Error: {e}", file=sys.stderr)
                    sys.exit(1)
         | 
| 100 | 
            +
             | 
| 101 | 
            +
            if __name__ == "__main__":
         | 
| 102 | 
            +
                main()
         |