Spaces:

NLPGenius
/

CVE-FactChecker

Sleeping

App Files Files Community

CVE-FactChecker / cve_factchecker /embeddings.py

NLPGenius

Fix permission errors, rate limiting, and add English language filtering

e06a21d 3 months ago

raw

history blame

2.16 kB

	from __future__ import annotations
	import hashlib
	import os
	from typing import List, Any

	class SimpleDummyEmbeddings:
	    """Dependency-free fallback embedder that yields hash-derived unit vectors.

	    Each text is lower-cased, hashed with SHAKE-256, and the digest bits are
	    used as 0/1 vector components which are then L2-normalised. Texts that
	    differ only in letter case map to the same vector. The vectors carry no
	    semantic meaning; they only give every distinct text a stable fingerprint.

	    Args:
	        dim: Number of components in every produced vector.
	    """

	    def __init__(self, dim: int = 384):
	        self.dimension = dim

	    def _embed_one(self, text: str) -> List[float]:
	        """Return the normalised bit-vector fingerprint for a single text."""
	        dim = self.dimension
	        # The built-in hash() is salted per interpreter process and only
	        # 64 bits wide, which made vectors differ between runs and left all
	        # components past the 63rd at zero. SHAKE-256 is stable across runs
	        # and can emit exactly as many bits as the vector needs.
	        n_bytes = max(0, (dim + 7) // 8)
	        payload = text.lower().encode("utf-8", "surrogatepass")
	        digest = hashlib.shake_256(payload).digest(n_bytes)
	        bits = [float((digest[i >> 3] >> (i & 7)) & 1) for i in range(dim)]
	        # Guard against an all-zero vector (or dim <= 0) to avoid dividing by 0.
	        norm = sum(b * b for b in bits) ** 0.5 or 1.0
	        return [b / norm for b in bits]

	    def embed_documents(self, texts: List[str]) -> List[List[float]]:
	        """Embed every text in ``texts``; the result keeps the input order."""
	        return [self._embed_one(t) for t in texts]

	    def embed_query(self, text: str) -> List[float]:
	        """Embed a single query string with the same scheme as documents."""
	        return self.embed_documents([text])[0]

	def _resolve_cache_dir() -> str:
	    """Pick a writable directory for the sentence-transformers model cache.

	    Prefers ``/data/sentence_transformers`` when ``/data`` exists and the
	    cache directory can be created and written to; otherwise falls back to
	    ``/tmp/sentence_transformers``.
	    """
	    fallback = "/tmp/sentence_transformers"
	    if not os.path.isdir("/data"):
	        return fallback
	    preferred = "/data/sentence_transformers"
	    try:
	        os.makedirs(preferred, exist_ok=True)
	    except OSError:
	        # /data exists but is not usable by this process; use the fallback
	        # instead of letting the model load fail on a permission error.
	        return fallback
	    return preferred if os.access(preferred, os.W_OK) else fallback


	def build_embeddings() -> Any:
	    """Return an embeddings object, preferring HuggingFace over the dummy one.

	    Resolution order:
	      1. ``SimpleDummyEmbeddings`` if the ``USE_DUMMY_EMBEDDINGS`` environment
	         variable is set to ``1``, ``true`` or ``yes`` (case-insensitive).
	      2. ``HuggingFaceEmbeddings`` imported from ``langchain_huggingface`` or,
	         failing that, ``langchain_community.embeddings``, loaded on CPU with
	         normalised output vectors.
	      3. ``SimpleDummyEmbeddings`` if neither import works or the model
	         cannot be loaded.

	    Side effects:
	        Sets ``SENTENCE_TRANSFORMERS_HOME`` in ``os.environ`` to the chosen
	        cache directory before loading the HuggingFace model, and prints a
	        warning to stdout when falling back after a load failure.
	    """
	    # Allow forcing lightweight embeddings to speed up cold starts (e.g., on Spaces)
	    flag = os.environ.get("USE_DUMMY_EMBEDDINGS", "").strip().lower()
	    if flag in ("1", "true", "yes"):  # pragma: no cover
	        return SimpleDummyEmbeddings()

	    # Broad excepts are deliberate: a broken optional dependency must never
	    # prevent the application from starting with the fallback embedder.
	    try:
	        from langchain_huggingface import HuggingFaceEmbeddings  # type: ignore
	    except Exception:
	        try:
	            from langchain_community.embeddings import HuggingFaceEmbeddings  # type: ignore
	        except Exception:
	            HuggingFaceEmbeddings = None  # type: ignore

	    if HuggingFaceEmbeddings is not None:
	        try:
	            # Set cache directory to a writable location
	            cache_dir = _resolve_cache_dir()

	            # Set environment variable for sentence-transformers cache
	            os.environ['SENTENCE_TRANSFORMERS_HOME'] = cache_dir

	            return HuggingFaceEmbeddings(
	                model_name="sentence-transformers/all-MiniLM-L6-v2",
	                model_kwargs={"device": "cpu"},
	                encode_kwargs={"normalize_embeddings": True},
	                cache_folder=cache_dir,
	            )
	        except Exception as e:
	            print(f"⚠️ Could not load HuggingFace embeddings: {e}")
	            print("🔄 Using dummy embeddings fallback")

	    return SimpleDummyEmbeddings()