Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

app.py 5.9 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
  1. import streamlit as st
  2. import warnings
  3. warnings.filterwarnings('ignore')
  4. import os
  5. from io import BytesIO
  6. from transformers import AutoTokenizer, AutoModel, BertConfig
  7. import torch
  8. from gensim.models import FastText
  9. from gensim.test.utils import common_texts
  10. import pandas as pd
  11. import string
  12. import re
  13. import scipy
  14. from scipy import spatial
  15. import numpy as np
  16. import os
  17. import os
  # DVC pointer file for the sentence-embedding model used by the BERT scorer.
  model = './paraphrase-MiniLM-L6-v2.dvc'

  # Fetch the model only when the DYNO variable is set (presumably a Heroku
  # dyno -- confirm) and the repository still has its .dvc directory.
  if "DYNO" in os.environ:
      if os.path.isdir(".dvc"):
          # Let DVC run without a git repository around it.
          os.system("dvc config core.no_scm true")
          # os.system returns a non-zero status when the pull fails.
          if os.system(f"dvc pull {model}"):
              exit("dvc pull failed")
          # Remove the DVC metadata and binaries once the model is in place.
          os.system("rm -r .dvc .apt/usr/lib/dvc")
  # Browser-tab title, icon and layout for the page.
  PAGE_CONFIG = dict(
      page_title="Searching for Similarity!",
      page_icon=":book:",
      layout="centered",
  )
  st.set_page_config(**PAGE_CONFIG)

  # Sidebar content: project blurb, link to the repository, then the header.
  st.sidebar.markdown(
      "This project aims to compute semantic similarity score between a query and a list of sentences."
  )
  st.sidebar.markdown(
      "[Find the project here!](https://dagshub.com/ShambhaviCodes/Semantic_Similarity)"
  )
  st.sidebar.subheader("Semantic Similarity")
  @st.cache
  def read_sc(file, isfile=False):
      """Return the search criteria as whitespace-stripped text.

      Args:
          file: Path of a text file when ``isfile`` is true; otherwise the
              search text itself.
          isfile: Whether ``file`` must be opened and read from disk.

      Returns:
          The text with leading and trailing whitespace removed.
      """
      if not isfile:
          return file.strip()
      with open(file, 'r') as handle:
          return handle.read().strip()
  def mean_pooling(model_output, attention_mask):
      """Average token embeddings, counting only positions the mask keeps.

      Deliberately not wrapped in a Streamlit cache: looking up a cached
      result would require hashing the full tensors, which costs more than
      the sum and division performed here.

      Args:
          model_output: Indexable model output whose first element holds the
              token embeddings.
          attention_mask: Mask tensor; it is unsqueezed on the last axis and
              expanded to the size of the token embeddings.

      Returns:
          The masked sum over dimension 1 divided by the mask count.
      """
      token_embeddings = model_output[0]
      mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
      summed = torch.sum(token_embeddings * mask, 1)
      # Clamp so a fully masked row divides by 1e-9 instead of zero.
      counts = torch.clamp(mask.sum(1), min=1e-9)
      return summed / counts
  def cos_score_BERT(cleaned_df, search_criteria, column_name, isfile=False):
      """Score each row's text against the search criteria with MiniLM embeddings.

      Args:
          cleaned_df: DataFrame holding the candidate sentences.
          search_criteria: Search text, or a file path when ``isfile`` is true.
          column_name: Column of ``cleaned_df`` containing the sentences.
          isfile: Forwarded to ``read_sc``.

      Returns:
          A tuple ``(scored_df, sentence, idx)``: the truncated frame with a
          'Similarity Score' column, the highest-scoring sentence, and its
          position in the truncated list of sentences.

      Raises:
          ValueError: If no sentences remain after truncation.
      """
      # Only rows up to index label 10 are scored.
      cleaned_df = cleaned_df.truncate(after=10)
      search_data = read_sc(search_criteria, isfile)
      embed_queries = cleaned_df[column_name].to_list()
      if not embed_queries:
          raise ValueError("no sentences to compare against")

      # NOTE(review): the model and tokenizer are reloaded from disk on every call.
      model = AutoModel.from_pretrained('./paraphrase-MiniLM-L6-v2')
      tokenizer = AutoTokenizer.from_pretrained('./paraphrase-MiniLM-L6-v2')

      def _embed(text):
          """Return the mean-pooled sentence embedding for ``text``."""
          encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
          with torch.no_grad():
              output = model(**encoded)
          return mean_pooling(output, encoded['attention_mask'])

      embed_text = _embed(search_data)

      cos_scores = []
      for query in embed_queries:
          # Cosine similarity along the embedding axis. The score is kept
          # un-negated so that argmax below selects the MOST similar sentence.
          similarity = torch.nn.functional.cosine_similarity(embed_text, _embed(query), dim=1)
          cos_scores.append(similarity.cpu().numpy())
      cos_scores = np.hstack(cos_scores)

      idx = np.argmax(cos_scores)
      sentence = embed_queries[idx]
      cleaned_df['Similarity Score'] = cos_scores
      return cleaned_df, sentence, idx
  @st.cache
  def clean_query(doc):
      """Split ``doc`` on whitespace and keep only clean alphabetic words.

      ASCII punctuation is removed from every token; a token is kept only if
      what remains is purely alphabetic and longer than one character.

      Args:
          doc: Text to tokenize.

      Returns:
          List of the surviving tokens, in their original order.
      """
      strip_punctuation = re.compile('[%s]' % re.escape(string.punctuation)).sub
      kept = []
      for raw in doc.split():
          word = strip_punctuation('', raw)
          if len(word) > 1 and word.isalpha():
              kept.append(word)
      return kept
  # FastText model shared by all calls during one execution of the script.
  _FASTTEXT_MODEL = None


  def _get_fasttext_model():
      """Train the FastText model on first use and reuse it afterwards."""
      global _FASTTEXT_MODEL
      if _FASTTEXT_MODEL is None:
          _FASTTEXT_MODEL = FastText(
              vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10
          )
      return _FASTTEXT_MODEL


  def cosine_similarity_score(base_document, query):
      """Return the cosine similarity of two token lists as a percentage.

      Each token list is represented by the mean of its FastText word
      vectors. The model is trained once and shared, so every pair is scored
      with the same vectors instead of a freshly trained model per call.

      Args:
          base_document: Tokens of the reference text.
          query: Tokens of the text being compared.

      Returns:
          A tuple ``(score, '%')`` where ``score`` is ``(1 - cosine distance)
          * 100`` rounded to two decimals. ``(0.0, '%')`` is returned when
          either token list is empty, since no mean vector exists then.
      """
      if not base_document or not query:
          return (0.0, '%')
      model = _get_fasttext_model()
      base_vector = np.mean([model.wv[word] for word in base_document], axis=0)
      query_vector = np.mean([model.wv[word] for word in query], axis=0)
      cosine = scipy.spatial.distance.cosine(base_vector, query_vector)
      return (round((1 - cosine) * 100, 2), '%')
  def rank(cleaned_df, search_criteria, column_name, isfile=False):
      """Score each row's text against the search criteria with FastText vectors.

      Args:
          cleaned_df: DataFrame holding the candidate sentences.
          search_criteria: Search text, or a file path when ``isfile`` is true.
          column_name: Column of ``cleaned_df`` containing the sentences.
          isfile: Forwarded to ``read_sc``.

      Returns:
          A tuple ``(scored_df, sentence, idx)``: the truncated frame with a
          'Similarity Score' column, the highest-scoring sentence, and its
          position in the truncated list of sentences.

      Raises:
          ValueError: If no sentences remain after truncation.
      """
      # Only rows up to index label 10 are scored.
      cleaned_df = cleaned_df.truncate(after=10)
      search_data = read_sc(search_criteria, isfile)
      embed_queries = cleaned_df[column_name].to_list()
      if not embed_queries:
          raise ValueError("no sentences to compare against")

      tokenized_search_data = clean_query(search_data)
      # cosine_similarity_score returns (value, '%'). Keep only the numeric
      # value: stacking the whole tuples would give a string array twice as
      # long as the frame.
      cos_scores = np.array(
          [
              cosine_similarity_score(tokenized_search_data, clean_query(text))[0]
              for text in embed_queries
          ],
          dtype=float,
      )

      idx = np.argmax(cos_scores)
      sentence = embed_queries[idx]
      cleaned_df['Similarity Score'] = cos_scores
      return cleaned_df, sentence, idx
  def _run_and_show(scorer, frame, criteria, text_column):
      """Run ``scorer`` on the uploaded data and render its result.

      Shows a warning instead of calling ``scorer`` when ``text_column`` is
      not a column of ``frame``, which would otherwise raise ``KeyError``.
      """
      if text_column not in frame.columns:
          st.warning("Please enter the name of an existing column in the uploaded file.")
          return
      result, similar_sentence, location = scorer(frame, criteria, text_column, isfile=False)
      st.dataframe(result)
      st.text('The most similar sentence in your entered database to the query is :')
      st.text(similar_sentence)
      st.text('The sentence was found at the index :')
      st.text(location)


  search_criteria = st.text_area("Please enter your search criteria.", height=100)
  query_file = st.file_uploader('Please upload your .csv file containing text to be matched.')

  # Everything below needs the uploaded data, so it is only shown after an upload.
  if query_file is not None:
      df = pd.read_csv(query_file)
      st.write(df)
      column_name = st.text_area("Name of the column containing the text to be matched", height=100)

      if st.checkbox("fastText", key='fasttext'):
          st.write('Calculating similarity scores using FastText')
          _run_and_show(rank, df, search_criteria, column_name)

      if st.checkbox("BERT", key='bert'):
          st.write('Calculating similarity scores using BERT')
          _run_and_show(cos_score_BERT, df, search_criteria, column_name)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...