# query_expansion / app.py
# Author: naveed-stockmark — "Update app.py" (commit bce9eee)
import streamlit as st
import pandas as pd
import re
import json
# Helper functions
def normalize_alpha(text):
    """Normalize a surface form for dictionary lookup: lowercase, map
    underscores and hyphens to spaces, fold the macron 'ō' to 'o', and
    strip space-prefixed parenthesized qualifiers like " (company)"."""
    cleaned = text.lower()
    for old, new in (('_', ' '), ('-', ' '), ('ō', 'o')):
        cleaned = cleaned.replace(old, new)
    return re.sub(r' \([^()]+?\)', '', cleaned)
def load_json(path):
    """Load and return a JSON object from *path*.

    Reads the file explicitly as UTF-8 text instead of raw bytes
    ('rb'), which previously relied on the json module's encoding
    autodetection. Logs the loaded path for startup diagnostics.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded json from path: {path}")
    return data
def get_concept_title(c):
    """Return a human-readable title for a ConceptNet URI: the last
    path segment with underscores replaced by spaces."""
    last_segment = c.rsplit('/', 1)[-1]
    return last_segment.replace('_', ' ')
def print_links(lst):
    """Render each ConceptNet URI in *lst* as a markdown bullet that
    links to the corresponding conceptnet.io page."""
    for uri in lst:
        if '/n/wn/' in uri:
            # WordNet-sense URIs get the sense appended in parentheses
            parts = uri.split('/n/wn/')
            title = f"{get_concept_title(parts[0])} ({parts[1].replace('_', ' ')})"
        else:
            title = get_concept_title(uri)
        st.markdown(f"- [{title}](https://conceptnet.io{uri})")
def print_list(lst):
    """Render each string in *lst* as a plain markdown bullet."""
    for item in lst:
        st.markdown("- " + item)
# Load data: entity records with ConceptNet links, and the filtered relation table
terms_to_conceptnet = load_json("solution_technology_w_conceptnet_20221222.json")
conceptnet_relations = pd.read_excel("conceptnet_relations_filtered_20230111.xlsx")

# Build a dictionary from normalized surface form -> indices into terms_to_conceptnet.
# Must stay a defaultdict(list): lookups of unseen queries below rely on getting [].
from collections import defaultdict

term_to_entity = defaultdict(list)
for idx, entry in enumerate(terms_to_conceptnet):
    surface_forms = [entry['title']]
    if entry['en_page']:
        surface_forms.append(entry['en_page'])
    surface_forms.extend(entry['synonyms'])
    for form in surface_forms:
        term_to_entity[normalize_alpha(form)].append(idx)

# Deduplicate the index lists (title/page/synonyms may normalize to the same key)
for term, ids in term_to_entity.items():
    term_to_entity[term] = list(set(ids))

all_terms = list(term_to_entity.keys())
# Demo start
st.subheader("Query Expansion with ConceptNet")
input_text = st.text_input(
    "Enter Query",
    "semiconductor",
    key="theme",
)

# Look up entity records whose normalized surface form matches the query
query_key = normalize_alpha(input_text)
matched_entities = [terms_to_conceptnet[idx] for idx in term_to_entity[query_key]]

# Streamlit specific: radio labels show "title / en_page" when an English page exists
select_names = []
for ent in matched_entities:
    label = ent['title'] if ent['en_page'] == '' else ent['title'] + ' / ' + ent['en_page']
    select_names.append(label)
select_to_ent = {ent['title']: ent for ent in matched_entities}
entity_term = st.radio(label='Select Wikipedia Entity', options=select_names, index=0)
# Match conceptnet
if entity_term is not None:
    # Radio labels are "title / en_page"; recover the bare title, which is the dict key
    entity_term = entity_term.split(' / ')[0] if ' / ' in entity_term else entity_term
    entity = select_to_ent[entity_term]
    # ConceptNet URIs linked to this Wikipedia entity
    matched_concepts = entity['conceptnet_items']

    # External links: English Wikipedia (if any), Japanese Wikipedia, Wikidata (if any)
    if len(entity['en_page']) > 0:
        st.subheader("English Page")
        st.markdown(f"[{entity['en_page']}]({'https://en.wikipedia.org/wiki/' + entity['en_page'].replace(' ', '_')})")
    st.subheader("Japanese Page")
    st.markdown(f"[{entity['title']}]({'https://ja.wikipedia.org/wiki/' + entity['title']})")
    if len(entity['wikidata']) > 0:
        st.subheader("Wikidata Item")
        st.markdown(f"[{entity['wikidata']}]({'https://www.wikidata.org/wiki/' + entity['wikidata']})")

    # Get all relations where the head is one of the matched concepts of entity.
    # Sense-tagged URIs carry a '/n/...' suffix, which is stripped before comparing.
    head_match = conceptnet_relations[conceptnet_relations['start'].apply(lambda x: x.split('/n/')[0] in matched_concepts if '/n/' in x else x in matched_concepts)]
    # Get relevant relations (tails of IsA and UsedFor edges out of the entity)
    is_a_match = head_match[head_match['relation'] == '/r/IsA']['end'].unique().tolist()
    used_for_match = head_match[head_match['relation'] == '/r/UsedFor']['end'].unique().tolist()
    # Wikipedia redirects, normalized and deduplicated
    wiki_suggestions = list(set([normalize_alpha(x) for x in entity['synonyms']]))
    # Get all relations where the tail is one of the matched concepts of entity
    tail_match = conceptnet_relations[conceptnet_relations['end'].apply(lambda x: x.split('/n/')[0] in matched_concepts if '/n/' in x else x in matched_concepts)]
    # Get superclasses (heads of IsA edges pointing at the entity)
    superclass_match = tail_match[tail_match['relation'] == '/r/IsA']['start'].unique().tolist()

    # Conceptnet items that match the wikipedia entity
    st.subheader("Matched Conceptnet items")
    if len(matched_concepts) > 0:
        print_links(matched_concepts)
    else:
        st.write("No Matches")

    # Conceptnet is-a relations
    st.subheader("Is a:")
    if len(is_a_match) > 0:
        print_links(is_a_match)
    else:
        st.write("No Matches")

    # Conceptnet used-for relations
    st.subheader("Used for:")
    if len(used_for_match) > 0:
        print_links(used_for_match)
    else:
        st.write("No Matches")

    # Conceptnet superclasses
    st.subheader("Superclasses:")
    if len(superclass_match) > 0:
        print_links(superclass_match)
    else:
        st.write("No Matches")

    # Wikidata and conceptnet similar terms
    st.subheader("Wikipedia Redirects:")
    print_list(wiki_suggestions)