Skip to content

Commit e54ed1f

Browse files
src/__init__.py: get_tessdata(): improve search for tesseract data.
Also try `tesseract --list-langs`, and look for 'tesseract' as well as 'tesseract-ocr' - this works on macOS. May help with #4565.
1 parent 0f217b6 commit e54ed1f

File tree

1 file changed

+28
-13
lines changed

1 file changed

+28
-13
lines changed

src/__init__.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17802,6 +17802,14 @@ def get_tessdata(tessdata=None):
1780217802
# Try to locate the tesseract-ocr installation.
1780317803

1780417804
import subprocess
17805+
17806+
cp = subprocess.run('tesseract --list-langs', shell=1, capture_output=1, check=0, text=True)
17807+
if cp.returncode == 0:
17808+
m = re.search('List of available languages in "(.+)"', cp.stdout)
17809+
if m:
17810+
tessdata = m.group(1)
17811+
return tessdata
17812+
1780517813
# Windows systems:
1780617814
if sys.platform == "win32":
1780717815
cp = subprocess.run("where tesseract", shell=1, capture_output=1, check=0, text=True)
@@ -17816,20 +17824,27 @@ def get_tessdata(tessdata=None):
1781617824
raise RuntimeError("No tessdata specified and Tesseract installation has no {tessdata} folder")
1781717825

1781817826
# Unix-like systems:
17819-
cp = subprocess.run("whereis tesseract-ocr", shell=1, capture_output=1, check=0, text=True)
17820-
response = cp.stdout.strip().split()
17821-
if cp.returncode or len(response) != 2: # if not 2 tokens: no tesseract-ocr
17822-
raise RuntimeError("No tessdata specified and Tesseract is not installed")
17823-
17824-
# search tessdata in folder structure
17825-
dirname = response[1] # contains tesseract-ocr installation folder
17826-
pattern = f"{dirname}/*/tessdata"
17827-
tessdatas = glob.glob(pattern)
17828-
tessdatas.sort()
17829-
if tessdatas:
17830-
return tessdatas[-1]
17827+
attempts = list()
17828+
for path in 'tesseract-ocr', 'tesseract':
17829+
cp = subprocess.run(f'whereis {path}', shell=1, capture_output=1, check=0, text=True)
17830+
if cp.returncode == 0:
17831+
response = cp.stdout.strip().split()
17832+
if len(response) == 2:
17833+
# search tessdata in folder structure
17834+
dirname = response[1] # contains tesseract-ocr installation folder
17835+
pattern = f"{dirname}/*/tessdata"
17836+
attempts.append(pattern)
17837+
tessdatas = glob.glob(pattern)
17838+
tessdatas.sort()
17839+
if tessdatas:
17840+
return tessdatas[-1]
17841+
if attempts:
17842+
text = 'No tessdata specified and no match for:\n'
17843+
for attempt in attempts:
17844+
text += f' {attempt}'
17845+
raise RuntimeError(text)
1783117846
else:
17832-
raise RuntimeError("No tessdata specified and Tesseract installation has no {pattern} folder.")
17847+
raise RuntimeError('No tessdata specified and Tesseract is not installed')
1783317848

1783417849

1783517850
def css_for_pymupdf_font(

0 commit comments

Comments
 (0)