Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

app.py 3.3 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
  1. import streamlit as st
  2. import warnings
  3. warnings.filterwarnings('ignore')
  4. import os
  5. from io import BytesIO
  6. import torch
  7. from gensim.models import FastText
  8. from gensim.test.utils import common_texts
  9. import pandas as pd
  10. import string
  11. import re
  12. import scipy
  13. from scipy import spatial
  14. import numpy as np
  15. import os
  16. import fasttext
  17. import contextlib
  18. import os
# --- Page-level Streamlit configuration ------------------------------------
# changing page main title and main icon(logo)
PAGE_CONFIG = {"page_title":"Searching for Similarity!", "page_icon":":book:", "layout":"centered"}
# Must run before any other Streamlit call renders output.
st.set_page_config(**PAGE_CONFIG)
# Short project description shown in the sidebar.
st.sidebar.markdown("This project aims to compute semantic similarity score between a query patent and a database of patents.")
# sidebar header
st.sidebar.subheader("Semantic Similarity")
  25. @st.cache
  26. def read_sc(file, isfile=False):
  27. if isfile:
  28. with open(file, 'r') as file:
  29. data = file.read()
  30. else:
  31. data = file
  32. data = data.strip()
  33. return data
  34. @st.cache
  35. def clean_query(doc):
  36. tokens = doc.split()
  37. re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  38. tokens = [re_punc.sub('', w) for w in tokens]
  39. tokens = [word for word in tokens if word.isalpha()]
  40. tokens = [word for word in tokens if len(word) > 1]
  41. return tokens
  42. def cosine_similarity_score(base_document, query):
  43. with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
  44. model = fasttext.load_model('model\my_model.bin')
  45. base_vector = np.mean([model[word] for word in base_document],axis=0)
  46. query_vector = np.mean([model[word] for word in query],axis=0)
  47. cosine = scipy.spatial.distance.cosine(base_vector, query_vector)
  48. return (round((1-cosine)*100,2),'%')
  49. def rank(cleaned_df, search_criteria, column_name, isfile=False):
  50. cleaned_df = cleaned_df.truncate(after = 10)
  51. search_data = read_sc(search_criteria, isfile)
  52. embed_queries = cleaned_df[column_name].to_list()
  53. tokenized_queries = [clean_query(text) for text in cleaned_df[column_name]]
  54. tokenized_search_data = clean_query(search_data)
  55. cos_scores = []
  56. for tokens_query in tokenized_queries:
  57. cosine_similarity = cosine_similarity_score(tokenized_search_data, tokens_query)
  58. cos_scores.append(cosine_similarity)
  59. cleaned_df['Similarity Score'] = cos_scores
  60. cos_scores = np.hstack(cos_scores)
  61. idx = np.argmax(cos_scores)
  62. sentence = embed_queries[idx]
  63. return cleaned_df, sentence, idx
# --- Main page UI ----------------------------------------------------------
# Free-text query entered by the user.
search_criteria = st.text_area("Please enter your search criteria.", height=100)
# CSV of candidate documents; one of its columns holds the text to match.
query_file = st.file_uploader('Please upload your .csv file containing text to be matched.')
if query_file is not None:
    df = pd.read_csv(query_file)
    st.write(df)
    column_name = st.text_area("Name of the column containing the text to be matched", height=100)
    # NOTE(review): indentation was lost in this copy of the file; the
    # nesting here assumes the checkbox and ranking only appear after a
    # file is uploaded (df would otherwise be undefined) — confirm against
    # the original source.
    if st.checkbox("fastText", key='fasttext'):
        st.write('Calculating similarity scores using FastText')
        # rank() scores the first 11 rows and returns the scored frame,
        # the best-matching sentence, and its positional index.
        result, similar_sentence, location = rank(df, search_criteria, column_name, isfile=False)
        st.dataframe(result)
        st.text('The most similar sentence in your entered database to the query is :')
        st.text(similar_sentence)
        st.text('The sentence was found at the index :')
        st.text(location)
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...