File size: 2,113 Bytes
99c2b2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from preprocessing_images import preprocessing_function
from datetime import datetime
from azure.storage.blob import BlobClient
from msrest.authentication import CognitiveServicesCredentials
#importing azure packages
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes

#ocr extraction using azure computer vision API
def azure_ocr(pdf_url, computervision_client, poll_interval=1.0):
    """Run the Computer Vision Read operation on a document and return its text.

    Submits ``pdf_url`` through ``computervision_client.read``, polls
    ``get_read_result`` until the status is no longer ``notStarted`` or
    ``running``, then joins the text of every recognised line with a space.

    Args:
        pdf_url: URL of the document; passed unchanged to
            ``computervision_client.read``.
        computervision_client: Object exposing ``read`` and
            ``get_read_result`` (a ``ComputerVisionClient`` in this file).
        poll_interval: Seconds to sleep between status polls while the
            operation is still pending. Defaults to one second.

    Returns:
        The recognised lines joined by single spaces, or an empty string when
        the operation ended in any status other than ``succeeded``.

    Raises:
        Exception: Wraps any error raised while submitting or polling; the
            original error is attached as ``__cause__``.
    """
    # Imported here so the block does not depend on a module-level import.
    import time

    try:
        read_response = computervision_client.read(pdf_url, raw=True)
        # The operation id is the last path segment of the Operation-Location
        # header returned by the submit call.
        operation_id = read_response.headers["Operation-Location"].split("/")[-1]

        while True:
            read_result = computervision_client.get_read_result(operation_id)
            if read_result.status not in ['notStarted', 'running']:
                break
            # Pause before asking again instead of hammering the service with
            # back-to-back requests while the operation is still pending.
            time.sleep(poll_interval)

        if read_result.status != OperationStatusCodes.succeeded:
            # Any non-succeeded terminal status yields no text.
            return ''

        return ' '.join(
            line.text
            for text_result in read_result.analyze_result.read_results
            for line in text_result.lines
        )
    except Exception as e:
        raise Exception(e) from e
def extract_text_from_url(test_pdf_url):
    """Preprocess a PDF, upload it to Blob Storage, OCR it and return the text.

    Steps, in order: call ``preprocessing_function`` on ``test_pdf_url``,
    upload the local file ``answer_paper.pdf`` to a timestamp-named blob,
    run ``azure_ocr`` on the blob URL, lower-case the result, and drop
    everything up to ten characters past the first ``"150 word"`` marker
    (falling back to ``"150 ward"``) when one is present.

    Args:
        test_pdf_url: Passed unchanged to ``preprocessing_function``.

    Returns:
        The lower-cased OCR text, trimmed after the marker if one was found.

    Raises:
        Exception: Wraps any error raised by a step; the original error is
            attached as ``__cause__``.
    """
    try:
        # NOTE(review): presumably this writes "answer_paper.pdf", which is
        # opened below -- confirm against preprocessing_images.
        preprocessing_function(test_pdf_url)

        # Blob names are unique only to the second.
        my_blob = 'test_clean_pdf' + datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        # NOTE(review): connection_string, my_container, endpoint and
        # subscription_key are not defined in the visible part of this file;
        # confirm they are provided at module level before this runs.
        blob = BlobClient.from_connection_string(
            conn_str=connection_string,
            container_name=my_container,
            blob_name=my_blob,
        )
        with open("answer_paper.pdf", "rb") as data:
            blob.upload_blob(data)

        computervision_client = ComputerVisionClient(
            endpoint, CognitiveServicesCredentials(subscription_key)
        )
        text = azure_ocr(blob.url, computervision_client).lower()

        # "150 ward" is presumably an OCR misreading of "150 word" -- TODO confirm.
        for marker in ("150 word", "150 ward"):
            marker_pos = text.find(marker)
            # str.find returns -1 on a miss, so index 0 is a real match and
            # must be accepted (the previous `> 0` test skipped it).
            if marker_pos >= 0:
                # The 10-character offset skips the 8-character marker plus
                # the two characters after it.
                text = text[marker_pos + 10:]
                break
        return text
    except Exception as e:
        raise Exception(e) from e