Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

app.py 3.3 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
  1. import streamlit as st
  2. import warnings
  3. warnings.filterwarnings('ignore')
  4. import os
  5. from io import BytesIO
  6. import torch
  7. from gensim.models import FastText
  8. from gensim.test.utils import common_texts
  9. import pandas as pd
  10. import string
  11. import re
  12. import scipy
  13. from scipy import spatial
  14. import numpy as np
  15. import os
  16. import fasttext
  17. import contextlib
  18. import os
# --- Page-level Streamlit configuration ------------------------------------
# changing page main title and main icon(logo)
PAGE_CONFIG = {"page_title":"Searching for Similarity!", "page_icon":":book:", "layout":"centered"}
# Must run before any other Streamlit call renders output.
st.set_page_config(**PAGE_CONFIG)
# Short project description shown in the sidebar.
st.sidebar.markdown("This project aims to compute semantic similarity score between a query patent and a database of patents.")
# sidebar header
st.sidebar.subheader("Semantic Similarity")
  25. @st.cache
  26. def read_sc(file, isfile=False):
  27. if isfile:
  28. with open(file, 'r') as file:
  29. data = file.read()
  30. else:
  31. data = file
  32. data = data.strip()
  33. return data
  34. @st.cache
  35. def clean_query(doc):
  36. tokens = doc.split()
  37. re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  38. tokens = [re_punc.sub('', w) for w in tokens]
  39. tokens = [word for word in tokens if word.isalpha()]
  40. tokens = [word for word in tokens if len(word) > 1]
  41. return tokens
  42. def cosine_similarity_score(base_document, query):
  43. with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
  44. model = fasttext.load_model('model\my_model.bin')
  45. base_vector = np.mean([model[word] for word in base_document],axis=0)
  46. query_vector = np.mean([model[word] for word in query],axis=0)
  47. cosine = scipy.spatial.distance.cosine(base_vector, query_vector)
  48. return (round((1-cosine)*100,2),'%')
  49. def rank(cleaned_df, search_criteria, column_name, isfile=False):
  50. cleaned_df = cleaned_df.truncate(after = 10)
  51. search_data = read_sc(search_criteria, isfile)
  52. embed_queries = cleaned_df[column_name].to_list()
  53. tokenized_queries = [clean_query(text) for text in cleaned_df[column_name]]
  54. tokenized_search_data = clean_query(search_data)
  55. cos_scores = []
  56. for tokens_query in tokenized_queries:
  57. cosine_similarity = cosine_similarity_score(tokenized_search_data, tokens_query)
  58. cos_scores.append(cosine_similarity)
  59. cleaned_df['Similarity Score'] = cos_scores
  60. cos_scores = np.hstack(cos_scores)
  61. idx = np.argmax(cos_scores)
  62. sentence = embed_queries[idx]
  63. return cleaned_df, sentence, idx
# --- Main page UI ----------------------------------------------------------
# Free-text query entered by the user.
search_criteria = st.text_area("Please enter your search criteria.", height=100)
# CSV of candidate documents; one of its columns holds the text to match.
query_file = st.file_uploader('Please upload your .csv file containing text to be matched.')
if query_file is not None:
    df = pd.read_csv(query_file)
    st.write(df)
    column_name = st.text_area("Name of the column containing the text to be matched", height=100)
    # NOTE(review): indentation was lost in this copy of the file; the
    # nesting here assumes the checkbox and ranking only appear after a
    # file is uploaded (df would otherwise be undefined) — confirm against
    # the original source.
    if st.checkbox("fastText", key='fasttext'):
        st.write('Calculating similarity scores using FastText')
        # rank() scores the first 11 rows and returns the scored frame,
        # the best-matching sentence, and its positional index.
        result, similar_sentence, location = rank(df, search_criteria, column_name, isfile=False)
        st.dataframe(result)
        st.text('The most similar sentence in your entered database to the query is :')
        st.text(similar_sentence)
        st.text('The sentence was found at the index :')
        st.text(location)
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...