# Streamlit Space: query expansion demo with ConceptNet.
import json
import re
from collections import defaultdict

import pandas as pd
import streamlit as st
# Helper functions
def normalize_alpha(text):
    """Normalize a surface form for dictionary lookup.

    Lowercases, maps '_' and '-' to spaces, folds the long-o macron
    ('ō' -> 'o'), and strips a trailing parenthesized qualifier such as
    " (disambiguation)".
    """
    cleaned = text.lower().replace('_', ' ').replace('-', ' ').replace('ō', 'o')
    # Raw string: '\(' in a plain string is an invalid escape (warns on 3.12+).
    return re.sub(r' \([^()]+?\)', '', cleaned)
def load_json(path):
    """Read and return the JSON document stored at *path*.

    Logs the path to stdout after a successful load.
    """
    # Text mode with an explicit encoding instead of 'rb': json.load accepts
    # both, but this makes the expected UTF-8 input explicit.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print("Loaded json from path: " + str(path))
    return data
def get_concept_title(c):
    """Turn a ConceptNet URI into a readable title: take the last path
    segment and replace underscores with spaces."""
    last_segment = c.rsplit('/', 1)[-1]
    return last_segment.replace('_', ' ')
def print_links(lst):
    """Render each ConceptNet URI in *lst* as a markdown bullet linking to
    conceptnet.io."""
    for uri in lst:
        if '/n/wn/' in uri:
            # WordNet-scoped concept: show the sense label in parentheses.
            parts = uri.split('/n/wn/')
            title = get_concept_title(parts[0]) + f" ({parts[1].replace('_', ' ')})"
        else:
            title = get_concept_title(uri)
        st.markdown(f"- [{title}]({'https://conceptnet.io' + uri})")
def print_list(lst):
    """Render each string in *lst* as a plain markdown bullet."""
    for item in lst:
        st.markdown("- " + item)
# Load data
terms_to_conceptnet = load_json("solution_technology_w_conceptnet_20221222.json")
conceptnet_relations = pd.read_excel("conceptnet_relations_filtered_20230111.xlsx")

# Build lookup: normalized surface form -> indices into terms_to_conceptnet.
# Collect into sets so duplicates (title / en_page / synonym collisions) are
# deduplicated as we insert, instead of a second list(set(...)) pass.
term_to_entity = defaultdict(set)
for i, elem in enumerate(terms_to_conceptnet):
    term_to_entity[normalize_alpha(elem['title'])].add(i)
    if elem['en_page']:
        term_to_entity[normalize_alpha(elem['en_page'])].add(i)
    for syn in elem['synonyms']:
        term_to_entity[normalize_alpha(syn)].add(i)
# Keep defaultdict semantics: downstream lookup of an unknown query must
# yield an empty list, not raise KeyError.
term_to_entity = defaultdict(list, {k: list(v) for k, v in term_to_entity.items()})
all_terms = list(term_to_entity.keys())
# Demo start
st.subheader("Query Expansion with ConceptNet")
input_text = st.text_input(
    "Enter Query",
    "semiconductor",
    key="theme",
)
normalized = normalize_alpha(input_text)
matched_ids = term_to_entity[normalized]
matched_entities = [terms_to_conceptnet[j] for j in matched_ids]
# Streamlit specific: build the radio labels and a title -> entity lookup.
select_names = []
select_to_ent = {}
for ent in matched_entities:
    # Label is "japanese title / english page" when an English page exists.
    if ent['en_page'] == '':
        select_names.append(ent['title'])
    else:
        select_names.append(ent['title'] + ' / ' + ent['en_page'])
    select_to_ent[ent['title']] = ent
entity_term = st.radio(label='Select Wikipedia Entity', options=select_names, index=0)
# Match conceptnet
if entity_term is not None:
    # The radio label may be "title / en_page"; recover the bare title key.
    entity_term = entity_term.split(' / ')[0] if ' / ' in entity_term else entity_term
    entity = select_to_ent[entity_term]
    matched_concepts = entity['conceptnet_items']

    def _in_matched(uri):
        """True if a relation endpoint (with any '/n/...' part-of-speech
        suffix removed) is one of the entity's matched concepts."""
        return (uri.split('/n/')[0] if '/n/' in uri else uri) in matched_concepts

    # Links to the source pages / items.
    # NOTE(review): assumes 'en_page' and 'wikidata' are always strings
    # (possibly empty), never None — confirm against the JSON schema.
    if len(entity['en_page']) > 0:
        st.subheader("English Page")
        st.markdown(f"[{entity['en_page']}]({'https://en.wikipedia.org/wiki/' + entity['en_page'].replace(' ', '_')})")
    st.subheader("Japanese Page")
    st.markdown(f"[{entity['title']}]({'https://ja.wikipedia.org/wiki/' + entity['title']})")
    if len(entity['wikidata']) > 0:
        st.subheader("Wikidata Item")
        st.markdown(f"[{entity['wikidata']}]({'https://www.wikidata.org/wiki/' + entity['wikidata']})")

    # Relations where the head is one of the matched concepts.
    head_match = conceptnet_relations[conceptnet_relations['start'].apply(_in_matched)]
    is_a_match = head_match[head_match['relation'] == '/r/IsA']['end'].unique().tolist()
    used_for_match = head_match[head_match['relation'] == '/r/UsedFor']['end'].unique().tolist()
    # Wikipedia redirects
    wiki_suggestions = list(set([normalize_alpha(x) for x in entity['synonyms']]))
    # Relations where the tail is one of the matched concepts (superclasses).
    tail_match = conceptnet_relations[conceptnet_relations['end'].apply(_in_matched)]
    superclass_match = tail_match[tail_match['relation'] == '/r/IsA']['start'].unique().tolist()

    # Render each result section; the four sections were copy-pasted before,
    # so drive them from one loop.
    for heading, uris in (
        ("Matched Conceptnet items", matched_concepts),
        ("Is a:", is_a_match),
        ("Used for:", used_for_match),
        ("Superclasses:", superclass_match),
    ):
        st.subheader(heading)
        if len(uris) > 0:
            print_links(uris)
        else:
            st.write("No Matches")

    # Wikidata and conceptnet similar terms
    st.subheader("Wikipedia Redirects:")
    print_list(wiki_suggestions)