import time
from datetime import datetime

# importing azure packages
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.storage.blob import BlobClient
from msrest.authentication import CognitiveServicesCredentials

from preprocessing_images import preprocessing_function

# NOTE(review): connection_string, my_container, endpoint and subscription_key
# are used by extract_text_from_url but are not defined in the source as
# reviewed -- confirm they are provided at module level before this runs.

# Substrings searched for in the lower-cased OCR text, in priority order.
# "150 ward" is kept alongside "150 word"; presumably it covers a common OCR
# misread -- confirm with the author.
_PROMPT_MARKERS = ("150 word", "150 ward")

# Number of characters dropped starting at the marker position. The markers
# are 8 characters long, so this also removes the two characters after them.
_PROMPT_SKIP = 10


# ocr extraction using azure computer vision API
def azure_ocr(pdf_url, computervision_client, poll_interval=1.0):
    """Run the Read operation on *pdf_url* and return the recognized text.

    Args:
        pdf_url: URL passed to ``computervision_client.read``.
        computervision_client: Client exposing ``read`` and
            ``get_read_result``.
        poll_interval: Seconds to wait between status polls. The original
            loop polled with no delay at all.

    Returns:
        The text of every recognized line joined with single spaces, or an
        empty string when the operation finishes in any state other than
        ``OperationStatusCodes.succeeded``.

    Raises:
        Whatever the client raises; errors propagate unchanged instead of
        being re-wrapped in a bare ``Exception``.
    """
    read_response = computervision_client.read(pdf_url, raw=True)
    # The operation id is the last path segment of the Operation-Location header.
    operation_location = read_response.headers["Operation-Location"]
    operation_id = operation_location.split("/")[-1]

    while True:
        read_result = computervision_client.get_read_result(operation_id)
        if read_result.status not in ['notStarted', 'running']:
            break
        # Pause between polls so the service is not hit in a tight loop.
        time.sleep(poll_interval)

    if read_result.status != OperationStatusCodes.succeeded:
        return ''

    return ' '.join(
        line.text
        for page in read_result.analyze_result.read_results
        for line in page.lines
    )


def _strip_prompt_prefix(text):
    """Drop everything up to and including the first prompt marker in *text*."""
    for marker in _PROMPT_MARKERS:
        position = text.find(marker)
        # str.find returns -1 on a miss; 0 is a valid hit at the very start,
        # which the previous `> 0` comparison wrongly ignored.
        if position != -1:
            return text[position + _PROMPT_SKIP:]
    return text


def extract_text_from_url(test_pdf_url):
    """Preprocess a PDF, upload it to blob storage, OCR it, and return the text.

    Args:
        test_pdf_url: Value handed to ``preprocessing_function``.

    Returns:
        The lower-cased OCR text. If it contains "150 word" (or, failing
        that, "150 ward"), only the text starting 10 characters after the
        start of that marker is returned.

    Raises:
        Whatever preprocessing, the upload, or the OCR call raises; errors
        propagate unchanged instead of being re-wrapped in a bare
        ``Exception``.
    """
    # Presumably this writes "answer_paper.pdf", which is read below --
    # verify against preprocessing_images.
    preprocessing_function(test_pdf_url)

    # Blob name carries a second-resolution timestamp.
    my_blob = 'test_clean_pdf' + datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    blob = BlobClient.from_connection_string(
        conn_str=connection_string,
        container_name=my_container,
        blob_name=my_blob,
    )
    with open("answer_paper.pdf", "rb") as data:
        blob.upload_blob(data)

    computervision_client = ComputerVisionClient(
        endpoint, CognitiveServicesCredentials(subscription_key)
    )
    text = azure_ocr(blob.url, computervision_client)
    return _strip_prompt_prefix(text.lower())