1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python3

import pytesseract
from PIL import Image
import os

# Directory scanned for images: the working directory the script is run from.
directory = os.getcwd()

# Point pytesseract at the Homebrew Tesseract binary only when that file
# actually exists. Otherwise pytesseract's default command is left in place,
# so the executable is resolved through PATH instead of failing on machines
# without a Homebrew install at this location.
_HOMEBREW_TESSERACT = r"/opt/homebrew/bin/tesseract"
if os.path.isfile(_HOMEBREW_TESSERACT):
    pytesseract.pytesseract.tesseract_cmd = _HOMEBREW_TESSERACT


def extract_text_and_save(image_path):
    """Run OCR on one image and write the text to a sibling Markdown file.

    The output path is ``image_path`` with its extension replaced by ``.md``.
    An existing file at that path is overwritten.

    Args:
        image_path: Path to the image file to read.

    Returns:
        None. The recognized text is written to disk.
    """
    # Pillow keeps the underlying file open for as long as the image object
    # lives, so scope it with a context manager rather than leaking one
    # handle per image when processing a whole directory.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)

    # Same base name as the image, with a ".md" extension.
    md_filename = os.path.splitext(image_path)[0] + ".md"

    # OCR output can contain non-ASCII characters; write UTF-8 explicitly
    # instead of depending on the platform's default text encoding.
    with open(md_filename, "w", encoding="utf-8") as f:
        f.write(text)


# Convert every regular file in the directory whose name ends in ".jpg"
# (case-insensitive). Entries are visited in sorted order so repeated runs
# process files in the same sequence, and anything that is not a regular
# file (e.g. a folder named "scans.jpg") is skipped instead of being handed
# to the image loader.
for filename in sorted(os.listdir(directory)):
    if not filename.lower().endswith(".jpg"):
        continue
    image_path = os.path.join(directory, filename)
    if os.path.isfile(image_path):
        extract_text_and_save(image_path)