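"""Streamlit app that scores the semantic similarity between a search query and the
sentences in an uploaded CSV file, using either averaged FastText word vectors or
mean-pooled embeddings from the paraphrase-MiniLM-L6-v2 transformer."""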
import streamlit as st
import warnings
warnings.filterwarnings('ignore')

import os
import re
import string
import sys

import numpy as np
import pandas as pd
import torch
from gensim.models import FastText
from gensim.test.utils import common_texts
from scipy import spatial
from sentence_transformers import util  # provides util.pytorch_cos_sim, used in cos_score_BERT below
from transformers import AutoModel, AutoTokenizer
# On Heroku (where the DYNO env var is set), pull the DVC-tracked model weights before starting the app
MODEL_DVC = './paraphrase-MiniLM-L6-v2.dvc'
if "DYNO" in os.environ and os.path.isdir(".dvc"):
    os.system("dvc config core.no_scm true")
    if os.system(f"dvc pull {MODEL_DVC}") != 0:
        sys.exit("dvc pull failed")
    os.system("rm -r .dvc .apt/usr/lib/dvc")
# Page title, icon (logo) and layout
PAGE_CONFIG = {"page_title": "Searching for Similarity!", "page_icon": ":book:", "layout": "centered"}
st.set_page_config(**PAGE_CONFIG)
- st.sidebar.markdown("This project aims to compute semantic similarity score between a query and a list of sentences.")
- st.sidebar.markdown("[Find the project here!](https://dagshub.com/ShambhaviCodes/Semantic_Similarity)")
- # sidebar header
- st.sidebar.subheader("Semantic Similarity")
@st.cache
def read_sc(search_criteria, isfile=False):
    # Return the search criteria as clean text; read it from a file path when isfile is True
    if isfile:
        with open(search_criteria, 'r') as f:
            data = f.read()
    else:
        data = search_criteria
    return data.strip()
def mean_pooling(model_output, attention_mask):
    # Mean-pool the token embeddings, ignoring padded positions via the attention mask
    # (no st.cache here: the function is cheap and its tensor arguments are awkward to hash)
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
def cos_score_BERT(cleaned_df, search_criteria, column_name, isfile=False):
    # Demo limitation: only the first few rows are scored
    cleaned_df = cleaned_df.truncate(after=10)
    search_data = read_sc(search_criteria, isfile)
    embed_queries = cleaned_df[column_name].to_list()
    model = AutoModel.from_pretrained('./paraphrase-MiniLM-L6-v2')
    tokenizer = AutoTokenizer.from_pretrained('./paraphrase-MiniLM-L6-v2')
    # Embed the search criteria
    encoded_text = tokenizer(search_data, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output_text = model(**encoded_text)
    embed_text = mean_pooling(model_output_text, encoded_text['attention_mask'])
    # Embed each sentence and compute its cosine similarity to the search criteria
    cos_scores = []
    for query in embed_queries:
        encoded_query = tokenizer(query, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output_query = model(**encoded_query)
        pooled_out = mean_pooling(model_output_query, encoded_query['attention_mask'])
        cos_sim = util.pytorch_cos_sim(embed_text, pooled_out)[0]
        cos_scores.append(cos_sim.cpu().numpy())
    cos_scores = np.hstack(cos_scores)
    # Highest cosine similarity = most similar sentence
    idx = np.argmax(cos_scores)
    sentence = embed_queries[idx]
    cleaned_df['Similarity Score'] = cos_scores
    return cleaned_df, sentence, idx
@st.cache
def clean_query(doc):
    # Tokenize, strip punctuation, and keep alphabetic tokens longer than one character
    tokens = doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

def cosine_similarity_score(ft_model, base_document, query):
    # Average the FastText word vectors of each token list and compare them with cosine similarity
    base_vector = np.mean([ft_model.wv[word] for word in base_document], axis=0)
    query_vector = np.mean([ft_model.wv[word] for word in query], axis=0)
    cosine = spatial.distance.cosine(base_vector, query_vector)
    # Return the similarity as a percentage (a plain float, so np.argmax works on the collected scores)
    return round((1 - cosine) * 100, 2)
def rank(cleaned_df, search_criteria, column_name, isfile=False):
    # Demo limitation: only the first few rows are scored
    cleaned_df = cleaned_df.truncate(after=10)
    search_data = read_sc(search_criteria, isfile)
    embed_queries = cleaned_df[column_name].to_list()
    tokenized_queries = [clean_query(text) for text in cleaned_df[column_name]]
    tokenized_search_data = clean_query(search_data)
    # Train the toy FastText model once instead of retraining it for every row
    ft_model = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10)
    cos_scores = []
    for tokens_query in tokenized_queries:
        cos_scores.append(cosine_similarity_score(ft_model, tokenized_search_data, tokens_query))
    cos_scores = np.hstack(cos_scores)
    idx = np.argmax(cos_scores)
    sentence = embed_queries[idx]
    cleaned_df['Similarity Score'] = cos_scores
    return cleaned_df, sentence, idx
search_criteria = st.text_area("Please enter your search criteria.", height=100)
query_file = st.file_uploader('Please upload a .csv file containing the text to be matched.')
if query_file is not None:
    df = pd.read_csv(query_file)
    st.write(df)
    column_name = st.text_area("Name of the column containing the text to be matched", height=100)
    if st.checkbox("fastText", key='fasttext'):
        st.write('Calculating similarity scores using FastText')
        result, similar_sentence, location = rank(df, search_criteria, column_name, isfile=False)
        st.dataframe(result)
        st.text('The most similar sentence in your data to the query is:')
        st.text(similar_sentence)
        st.text('The sentence was found at index:')
        st.text(location)
    if st.checkbox("BERT", key='bert'):
        st.write('Calculating similarity scores using BERT')
        result, similar_sentence, location = cos_score_BERT(df, search_criteria, column_name, isfile=False)
        st.dataframe(result)
        st.text('The most similar sentence in your data to the query is:')
        st.text(similar_sentence)
        st.text('The sentence was found at index:')
        st.text(location)