Spaces:
Runtime error
Runtime error
| from typing import List | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
def calculate_word_overlaps(documents: List[str], query: str):
    """
    Return the mean count of distinct query words shared with each document.

    The query and every document are lower-cased and split on whitespace.
    Each document contributes the number of unique words it has in common
    with the query; the result is the mean of those counts, or 0.0 when
    ``documents`` is empty.
    """
    query_vocab = set(query.lower().split())
    shared_counts = [
        len(query_vocab & set(text.lower().split())) for text in documents
    ]
    # Guard first: np.mean of an empty list would yield nan with a warning.
    if not shared_counts:
        return 0.0
    return np.mean(shared_counts)
def calculate_duplication_rate(documents: List[str]):
    """
    Return the fraction of words across all documents that are repeats.

    Documents are lower-cased and split on whitespace, and the words of all
    documents are pooled together. The rate is
    ``(total words - distinct words) / total words``; it is 0.0 when the
    documents contain no words at all.
    """
    tokens = [word for text in documents for word in text.lower().split()]
    if not tokens:
        return 0.0
    token_count = len(tokens)
    distinct_count = len(set(tokens))
    return (token_count - distinct_count) / token_count
def cosine_similarity_score(documents: List[str], query: str):
    """
    Calculate TF-IDF cosine similarity between the query and each document.

    The query and the documents are vectorized together with a single
    ``TfidfVectorizer`` so they share one vocabulary, then the query vector
    is compared against every document vector.

    Args:
        documents: Texts to score against the query.
        query: Text the documents are compared to.

    Returns:
        A 1-D array with one similarity score per document, in input order.
        An empty array is returned when ``documents`` is empty.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer`` when none of the
            texts yields a usable token (empty vocabulary).
    """
    # With no documents the document slice of the matrix has zero rows, and
    # scikit-learn's cosine_similarity rejects zero-sample input. Return an
    # empty result instead, matching how the other metrics treat empty input.
    if not documents:
        return np.zeros(0)

    # Row 0 is the query; rows 1..n are the documents.
    tfidf_matrix = TfidfVectorizer().fit_transform([query] + documents)
    query_vs_documents = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    return query_vs_documents[0]
def jaccard_similarity_score(documents: List[str], query: str):
    """
    Return the Jaccard similarity of the query with each document.

    Texts are lower-cased and split on whitespace into word sets. For each
    document the score is ``|query ∩ doc| / |query ∪ doc|``; when both sets
    are empty the score is 0. Scores are returned as a list in input order.
    """
    query_vocab = set(query.lower().split())
    scores = []
    for text in documents:
        doc_vocab = set(text.lower().split())
        combined = query_vocab | doc_vocab
        if combined:
            scores.append(len(query_vocab & doc_vocab) / len(combined))
        else:
            # Both sets are empty: avoid dividing by zero.
            scores.append(0)
    return scores
def _render_bar_chart(scores, y_label, color=None):
    """Draw *scores* as a bar chart on its own figure and send it to Streamlit."""
    # Use a dedicated Figure instead of the implicit global pyplot figure, so
    # bars from earlier charts (or earlier calls) never leak into this one.
    fig, ax = plt.subplots()
    ax.bar(range(len(scores)), scores, color=color)
    ax.set_xlabel("Documents")
    ax.set_ylabel(y_label)
    st.pyplot(fig)
    # Release the figure once rendered so repeated calls do not pile up
    # open figures in pyplot's registry.
    plt.close(fig)


def display_similarity_results(cosine_scores, jaccard_scores, title):
    """
    Render two bar charts in Streamlit: cosine and Jaccard similarity scores.

    Each chart is preceded by a subheader built from ``title`` and plots one
    bar per document, indexed by position.

    Args:
        cosine_scores: Sequence of cosine similarity scores, one per document.
        jaccard_scores: Sequence of Jaccard similarity scores, one per document.
        title: Label prefixed to both chart subheaders.
    """
    st.subheader(f"{title} - Cosine Similarity to Query")
    _render_bar_chart(cosine_scores, "Cosine Similarity")

    st.subheader(f"{title} - Jaccard Similarity to Query")
    _render_bar_chart(jaccard_scores, "Jaccard Similarity", color='orange')