Spaces:

leavoigt
/

vulnerability

Sleeping

App Files Files Community

leavoigt committed on Sep 22, 2023

Commit

1947876

1 Parent(s): cfcd3f8

Delete utils/keyword_extraction.py

Browse files

Files changed (1) hide show

utils/keyword_extraction.py +0 -140

utils/keyword_extraction.py DELETED Viewed

@@ -1,140 +0,0 @@
-import pandas as pd
-# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
-# import nltk
-# nltk.download('stopwords')
-# from nltk.corpus import stopwords
-import pickle
-from typing import List, Text
-import logging
-from summa import keywords
-try:
-    import streamlit as st
-except ImportError:
-    logging.info("Streamlit not installed")
-def sort_coo(coo_matrix):
-    """
-    Rank the stored entries of a Coordinate format scipy sparse matrix.
-    Every entry is turned into a (column index, value) pair; the pairs are
-    returned as a list ordered by value, largest first, with ties broken by
-    the larger column index. Only the .col and .data attributes are read.
-    1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
-    """
-    pairs = list(zip(coo_matrix.col, coo_matrix.data))
-    # Sort on (value, column) so equal values have a deterministic order.
-    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
-    return pairs
-def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
-    """Map the top_n best-scoring features to their rounded tf-idf scores.
-    Params
-    ---------
-    feature_names: sequence of words from the vectorizer, indexed by the
-    column indices found in sorted_items
-    sorted_items: list of (column index, score) pairs, best first, as
-    returned by the sort_coo function defined in keyword_extraction.py
-    top_n: number of leading pairs of sorted_items to keep
-    Return
-    ----------
-    results: dict of feature name -> score rounded to 3 decimals, in the
-    same order as sorted_items
-    """
-    # One pass replaces the two parallel lists and the index loop. If a
-    # feature name occurs twice, the later score still overwrites the earlier.
-    return {
-        feature_names[idx]: round(score, 3)
-        for idx, score in sorted_items[:top_n]
-    }
-def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
-    """
-    TFIDF based keywords extraction
-    Params
-    ---------
-    textdata: text data which needs keyword extraction. A single string is
-    treated as one document; an iterable of strings is passed on unchanged.
-    vectorizer: trained count vectorizer model
-    tfidfmodel: TFIDF Transformer model
-    top_n: Top N keywords to be extracted
-    Return
-    ----------
-    keywords: list of the top extracted keywords, highest score first
-    """
-    # The vectorizer's transform expects an iterable of documents; wrap a bare
-    # string so the annotated str input is accepted as a single document.
-    documents = [textdata] if isinstance(textdata, str) else textdata
-    features = vectorizer.get_feature_names_out()
-    tf_idf_vector = tfidfmodel.transform(vectorizer.transform(documents))
-    sorted_items = sort_coo(tf_idf_vector.tocoo())
-    results = extract_topn_from_vector(features, sorted_items, top_n)
-    # Iterating the dict yields the keywords in score order; the local is not
-    # called "keywords" so it cannot shadow the imported summa module.
-    return list(results)
-def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
-    """
-    TFIDF based keywords extraction using the models stored for one sdg
-    Params
-    ---------
-    sdg: which sdg tfidf model to be used; selects the docStore/sdg<sdg>/
-    directory holding vectorizer.pkl and tfidfmodel.pkl
-    sdgdata: text data which needs keyword extraction
-    top_n: Top N keywords to be extracted
-    Return
-    ----------
-    keywords: top extracted keywords
-    """
-    model_path = "docStore/sdg{}/".format(sdg)
-    # NOTE(review): pickle.load executes code embedded in the file, so these
-    # model files must come from a trusted source.
-    # Context managers close both files even if unpickling fails.
-    with open(model_path+'vectorizer.pkl', 'rb') as vectorizer_file:
-        vectorizer = pickle.load(vectorizer_file)
-    with open(model_path+'tfidfmodel.pkl', 'rb') as tfidf_file:
-        tfidfmodel = pickle.load(tfidf_file)
-    # Same vectorize -> tf-idf -> rank -> top_n pipeline as tfidf_keyword, so
-    # delegate instead of duplicating it.
-    return tfidf_keyword(sdgdata, vectorizer, tfidfmodel, top_n)
-def _cache_if_streamlit_available(func):
-    """Wrap func with st.cache when streamlit was imported, else return it as is."""
-    # The module-level import of streamlit is optional; without this guard the
-    # decorator line itself raises NameError when streamlit is not installed.
-    streamlit = globals().get("st")
-    if streamlit is None:
-        return func
-    # NOTE(review): newer Streamlit releases deprecate st.cache in favour of
-    # st.cache_data - confirm against the pinned version.
-    return streamlit.cache(allow_output_mutation=True)(func)
-@_cache_if_streamlit_available
-def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
-    """
-    wrapper function to perform textrank, uses either ratio or wordcount to
-    extract top keywords limited by words or ratio.
-    1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py
-    Params
-    --------
-    textdata: text data to perform the textrank.
-    ratio: float to limit the number of keywords as proportion of total token
-        in textdata
-    words: number of keywords to be extracted. Takes priority over ratio if
-        non zero. If extraction with the fixed word count raises, the ratio
-        is used instead.
-    Return
-    --------
-    results: extracted keywords, one list item per line of summa's output
-    """
-    if words == 0:
-        # Report the ratio actually in use rather than a hard-coded 0.1.
-        logging.info("Textrank using ratio value = %s, as no words limit given", ratio)
-        return keywords.keywords(textdata, ratio=ratio).split("\n")
-    try:
-        return keywords.keywords(textdata, words=words).split("\n")
-    except Exception:
-        # Deliberate best-effort fallback, but no longer a bare except, so
-        # KeyboardInterrupt and SystemExit propagate.
-        logging.info("Textrank with words = %s failed, falling back to ratio = %s", words, ratio)
-        return keywords.keywords(textdata, ratio=ratio).split("\n")