# Flight_ATA_Class / extract_text.py
# anupam210's picture
# Duplicate from ai-based/azure_ocr
# 99c2b2d
# raw
# history blame
# No virus
# 2.11 kB
from preprocessing_images import preprocessing_function
from datetime import datetime
from azure.storage.blob import BlobClient
from msrest.authentication import CognitiveServicesCredentials
#importing azure packages
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
#ocr extraction using azure computer vision API
def azure_ocr(pdf_url, computervision_client, poll_interval=1.0):
    """Run the Computer Vision Read operation on a document URL and return its text.

    The document is submitted with ``computervision_client.read``; the result
    is then polled with ``get_read_result`` until its status is neither
    ``notStarted`` nor ``running``. On success, the text of every recognised
    line on every page is joined with single spaces.

    Args:
        pdf_url: URL of the document, passed unchanged to ``read``.
        computervision_client: Object exposing ``read`` and
            ``get_read_result`` (the caller in this file passes a
            ``ComputerVisionClient``).
        poll_interval: Seconds to sleep between two status polls, so the
            service is not queried in a tight loop.

    Returns:
        The recognised lines joined by single spaces, or an empty string when
        the operation finishes with any status other than succeeded.

    Raises:
        Exception: Any failure is re-raised as a plain ``Exception`` carrying
            the original error as its argument and as ``__cause__``.
    """
    import time  # function-scope import keeps this block self-contained

    try:
        read_response = computervision_client.read(pdf_url, raw=True)
        # The operation id is the last path segment of the Operation-Location header.
        operation_id = read_response.headers["Operation-Location"].split("/")[-1]

        while True:
            read_result = computervision_client.get_read_result(operation_id)
            if read_result.status not in ['notStarted', 'running']:
                break
            # Still pending: wait before asking again instead of busy-polling.
            time.sleep(poll_interval)

        if read_result.status != OperationStatusCodes.succeeded:
            return ''
        return ' '.join(
            line.text
            for page in read_result.analyze_result.read_results
            for line in page.lines
        )
    except Exception as e:
        # Same exception type as before; `from e` keeps the original error attached.
        raise Exception(e) from e
def _strip_question_preamble(text):
    """Lower-case OCR text and drop everything up to and including the "150 word" marker."""
    lowered = text.lower()
    # "150 ward" is tried second -- presumably a frequent OCR misreading of
    # "150 word"; confirm against real scans.
    for marker in ("150 word", "150 ward"):
        pos = lowered.find(marker)
        # find() returns -1 when absent; 0 is a valid hit at the very start.
        if pos != -1:
            # Skip 10 characters: the 8-character marker plus the two that
            # follow it (presumably "s" and a separator -- verify).
            return lowered[pos + 10:]
    return lowered


def extract_text_from_url(test_pdf_url):
    """Preprocess a document, upload the cleaned PDF to blob storage, and OCR it.

    Steps, in order:
      1. ``preprocessing_function(test_pdf_url)`` is called; its return value
         is not used.
      2. The local file ``answer_paper.pdf`` is uploaded to a blob whose name
         is ``test_clean_pdf`` followed by the current timestamp (one-second
         resolution). NOTE(review): presumably ``preprocessing_function``
         writes that file -- confirm.
      3. ``azure_ocr`` is run on the blob URL.
      4. The text is lower-cased and, when a "150 word" (or "150 ward")
         marker is present, everything up to 10 characters past the start of
         the marker is removed.

    NOTE(review): ``connection_string``, ``my_container``, ``endpoint`` and
    ``subscription_key`` are read as module-level names but are not defined
    in the visible part of this file; they must be provided before calling.

    Args:
        test_pdf_url: Passed unchanged to ``preprocessing_function``.

    Returns:
        The lower-cased OCR text with the leading preamble removed when a
        marker was found.

    Raises:
        Exception: Any failure is re-raised as a plain ``Exception`` carrying
            the original error as its argument and as ``__cause__``.
    """
    try:
        preprocessing_function(test_pdf_url)
        my_blob = 'test_clean_pdf' + datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        blob = BlobClient.from_connection_string(
            conn_str=connection_string,
            container_name=my_container,
            blob_name=my_blob,
        )
        with open("answer_paper.pdf", "rb") as data:
            blob.upload_blob(data)
        computervision_client = ComputerVisionClient(
            endpoint, CognitiveServicesCredentials(subscription_key)
        )
        text = azure_ocr(blob.url, computervision_client)
        return _strip_question_preamble(text)
    except Exception as e:
        # Same exception type as before; `from e` keeps the original error attached.
        raise Exception(e) from e