# ocr_by_img.py

# -*- coding:utf-8 -*-
# @Time : 2020/5/29 10:11
# @Author : litao

from PIL import Image
import pytesseract,os,re
import cv2
import argparse
import cv2
import os
# Build the command-line parser and parse the arguments immediately; the
# parsed options are exposed as a plain dict in ``args``.
ap = argparse.ArgumentParser()
ap.add_argument(
    "-p",
    "--preprocess",
    default="thresh",
    type=str,
    help="type of preprocessing to be done",
)
args = vars(ap.parse_args())
class Languages:
    """Language codes that are passed to Tesseract as the ``lang`` option."""

    # Simplified Chinese
    CHS = "chi_sim"
    # English
    ENG = "eng"

def img_to_str(image_path, lang=Languages.CHS, box=(490, 0, 560, 60),
               config="-psm 7 digits"):
    """Crop a rectangle out of an image and run Tesseract OCR on it.

    The cropped region is also written to ``thumb.jpg`` in the current
    working directory (overwritten on every call) so the area being
    recognised can be inspected by eye.

    Args:
        image_path: Path of the image file to read.
        lang: Tesseract language code, e.g. ``Languages.CHS``.
        box: ``(left, upper, right, lower)`` pixel rectangle to crop before
            recognition. The default is the region this script was tuned for.
        config: Extra command-line flags handed to Tesseract.
            NOTE(review): Tesseract 4+ is understood to spell the page
            segmentation flag ``--psm`` rather than ``-psm`` -- confirm
            against the installed version and pass ``config`` accordingly.

    Returns:
        The text Tesseract recognised inside ``box``.
    """
    # Modes Pillow's JPEG writer accepts; anything else (RGBA, P, LA, ...)
    # makes ``save("thumb.jpg")`` raise OSError.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    # The context manager releases the underlying file handle even when
    # cropping, saving or OCR fails.
    with Image.open(image_path) as img:
        thumb = img.crop(box)

        # Convert only the debug copy so the pixels given to Tesseract are
        # exactly the cropped ones.
        preview = thumb if thumb.mode in jpeg_modes else thumb.convert("RGB")
        preview.save("thumb.jpg")

        return pytesseract.image_to_string(thumb, lang=lang, config=config)

def file_path_scan(file_path):
    """Run OCR on every regular file in a directory and extract a number.

    Each file is passed to ``img_to_str``; the recognised text is printed and
    the first run of digits found in it is taken as the play count.

    Args:
        file_path: Directory whose files are scanned. Sub-directories and
            other non-file entries are skipped.

    Yields:
        ``(filename, play_count)`` tuples, where ``filename`` is the bare
        entry name and ``play_count`` is the first digit run as a ``str``,
        or the integer ``0`` when the text contains no digits.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the output
    # reproducible from run to run.
    for filename in sorted(os.listdir(file_path)):
        path = os.path.join(file_path, filename)
        if not os.path.isfile(path):
            continue

        title = img_to_str(path, lang=Languages.CHS)
        print(title)

        # Explicit match test instead of indexing findall() under a bare
        # ``except``, which would also have hidden unrelated errors.
        match = re.search(r"\d+", title)
        play_count = match.group() if match else 0
        yield filename, play_count


# Directory of screenshots whose names encode the capture time, e.g.
# "2020-04-16 202632.png".
file_path = r'D:\work_file\word_file_new\litao\num'
for filename, play_count in file_path_scan(file_path):
    # Drop the ".png" part, then put colons between the hour, minute and
    # second digits: "2020-04-16 202632" -> "2020-04-16 20:26:32".
    time_str = filename.replace(".png", "")
    time_str = ":".join((time_str[:13], time_str[13:15], time_str[15:]))
    print(time_str, play_count)

# print(img_to_str(r'D:\work_file\word_file_new\litao\screen\2020-04-16 202632.png', lang=Languages.CHS))