Spaces:
Sleeping
Sleeping
ecceembusra
committed on
Commit
·
d223ea8
1
Parent(s):
c788274
Add FAISS vectorstore and metadata
Browse files- .gitignore +2 -0
- providers.py +123 -0
- vectorstore/index.faiss +3 -0
- vectorstore/meta.jsonl +3 -0
- vectorstore/signature.json +3 -0
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
providers.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
# providers.py
|
| 2 |
from typing import List
|
| 3 |
import os
|
|
@@ -118,4 +119,126 @@ def generate(prompt: str) -> str:
|
|
| 118 |
)
|
| 119 |
return response.text.strip() if hasattr(response, "text") else "Cevap oluşturulamadı."
|
| 120 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
return f"LLM hata: {e}"
|
|
|
|
| 1 |
+
<<<<<<< HEAD
|
| 2 |
# providers.py
|
| 3 |
from typing import List
|
| 4 |
import os
|
|
|
|
| 119 |
)
|
| 120 |
return response.text.strip() if hasattr(response, "text") else "Cevap oluşturulamadı."
|
| 121 |
except Exception as e:
|
| 122 |
+
=======
|
| 123 |
+
# providers.py
|
| 124 |
+
from typing import List
|
| 125 |
+
import os
|
| 126 |
+
import numpy as np
|
| 127 |
+
import torch
|
| 128 |
+
from functools import lru_cache
|
| 129 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
| 130 |
+
|
| 131 |
+
from dotenv import load_dotenv
|
| 132 |
+
|
| 133 |
+
# .env dosyasını oku
|
| 134 |
+
load_dotenv()
|
| 135 |
+
|
| 136 |
+
# API anahtarını al
|
| 137 |
+
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
| 138 |
+
if not GOOGLE_API_KEY:
|
| 139 |
+
print("⚠️ Uyarı: GOOGLE_API_KEY .env dosyasında bulunamadı!")
|
| 140 |
+
|
| 141 |
+
# =========================
|
| 142 |
+
# CONFIG (env ile override)
|
| 143 |
+
# =========================
|
| 144 |
+
EMB_MODEL_NAME = os.getenv("EMB_MODEL", "intfloat/multilingual-e5-small")
|
| 145 |
+
# Hız için default MiniLM; Jina kullanmak istersen RERANKER_MODEL=jinaai/jina-reranker-v2-base-multilingual
|
| 146 |
+
RERANKER_NAME = os.getenv("RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
|
| 147 |
+
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-1.5-flash")
|
| 148 |
+
|
| 149 |
+
# =========================
|
| 150 |
+
# Embedding (E5)
|
| 151 |
+
# =========================
|
| 152 |
+
|
| 153 |
+
_emb_model: SentenceTransformer | None = None
|
| 154 |
+
|
| 155 |
+
def _get_emb_model() -> SentenceTransformer:
|
| 156 |
+
global _emb_model
|
| 157 |
+
if _emb_model is None:
|
| 158 |
+
# CPU'da stabil ve hızlı çalışması için
|
| 159 |
+
torch.set_num_threads(max(1, (os.cpu_count() or 4) // 2))
|
| 160 |
+
_emb_model = SentenceTransformer(EMB_MODEL_NAME)
|
| 161 |
+
return _emb_model
|
| 162 |
+
|
| 163 |
+
def embed(texts: List[str]) -> np.ndarray:
|
| 164 |
+
"""E5 embedding üretir (normalize etmez)."""
|
| 165 |
+
model = _get_emb_model()
|
| 166 |
+
vecs = model.encode(
|
| 167 |
+
texts,
|
| 168 |
+
batch_size=32,
|
| 169 |
+
show_progress_bar=False,
|
| 170 |
+
convert_to_numpy=True,
|
| 171 |
+
normalize_embeddings=False,
|
| 172 |
+
)
|
| 173 |
+
return vecs
|
| 174 |
+
|
| 175 |
+
# =========================
|
| 176 |
+
# Reranker (Cross-Encoder)
|
| 177 |
+
# =========================
|
| 178 |
+
|
| 179 |
+
_reranker: CrossEncoder | None = None
|
| 180 |
+
|
| 181 |
+
def _get_reranker() -> CrossEncoder:
|
| 182 |
+
global _reranker
|
| 183 |
+
if _reranker is None:
|
| 184 |
+
|
| 185 |
+
trust = "jina" in RERANKER_NAME.lower()
|
| 186 |
+
_reranker = CrossEncoder(
|
| 187 |
+
RERANKER_NAME,
|
| 188 |
+
max_length=384,
|
| 189 |
+
trust_remote_code=trust,
|
| 190 |
+
)
|
| 191 |
+
return _reranker
|
| 192 |
+
|
| 193 |
+
def rerank(query: str, candidates: List[str]) -> List[float]:
|
| 194 |
+
"""Sorgu + aday pasajlar için alaka skorları döndürür (yüksek skor = daha alakalı)."""
|
| 195 |
+
model = _get_reranker()
|
| 196 |
+
pairs = [[query, c] for c in candidates]
|
| 197 |
+
scores = model.predict(pairs, convert_to_numpy=True, show_progress_bar=False)
|
| 198 |
+
return scores.tolist()
|
| 199 |
+
|
| 200 |
+
# =========================
|
| 201 |
+
# (Opsiyonel) Ekstraktif QA – TR SQuAD
|
| 202 |
+
# =========================
|
| 203 |
+
|
| 204 |
+
_QA_MODEL = os.getenv("QA_MODEL", "savasy/bert-base-turkish-squad")
|
| 205 |
+
_qa_pipe = None # lazy load
|
| 206 |
+
|
| 207 |
+
def qa_extract(question: str, context: str) -> dict:
|
| 208 |
+
"""
|
| 209 |
+
Pasajdan doğrudan cevap span'ı çıkarır.
|
| 210 |
+
Dönen örnek: {'answer': '1907', 'score': 0.93, 'start': 123, 'end': 127}
|
| 211 |
+
Kullanmazsan çağırma; yüklenmez ve hız etkisi olmaz.
|
| 212 |
+
"""
|
| 213 |
+
global _qa_pipe
|
| 214 |
+
if _qa_pipe is None:
|
| 215 |
+
from transformers import pipeline # import burada ki ihtiyaca göre yüklensin
|
| 216 |
+
_qa_pipe = pipeline("question-answering", model=_QA_MODEL, tokenizer=_QA_MODEL)
|
| 217 |
+
res = _qa_pipe(question=question, context=context)
|
| 218 |
+
return dict(res)
|
| 219 |
+
|
| 220 |
+
# =========================
|
| 221 |
+
# LLM: Google Gemini
|
| 222 |
+
# =========================
|
| 223 |
+
|
| 224 |
+
def generate(prompt: str) -> str:
|
| 225 |
+
"""
|
| 226 |
+
Gemini ile üretken cevap. GOOGLE_API_KEY yoksa 'LLM yapılandırılmadı.' döner.
|
| 227 |
+
"""
|
| 228 |
+
api_key = os.getenv("GOOGLE_API_KEY")
|
| 229 |
+
if not api_key:
|
| 230 |
+
return "LLM yapılandırılmadı."
|
| 231 |
+
try:
|
| 232 |
+
import google.generativeai as genai
|
| 233 |
+
genai.configure(api_key=api_key)
|
| 234 |
+
model = genai.GenerativeModel(GEMINI_MODEL)
|
| 235 |
+
response = model.generate_content(
|
| 236 |
+
prompt,
|
| 237 |
+
generation_config=genai.types.GenerationConfig(
|
| 238 |
+
temperature=0.1, max_output_tokens=300, top_p=0.8
|
| 239 |
+
),
|
| 240 |
+
)
|
| 241 |
+
return response.text.strip() if hasattr(response, "text") else "Cevap oluşturulamadı."
|
| 242 |
+
except Exception as e:
|
| 243 |
+
>>>>>>> 1dc9f7f1 (Add FAISS vectorstore and metadata)
|
| 244 |
return f"LLM hata: {e}"
|
vectorstore/index.faiss
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34a05ed4dedffc5f6743e61a3befea015b633e541a5570f89eeaa579d38e8bd7
|
| 3 |
+
size 12167654
|
vectorstore/meta.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cdf60452691daf0cae6a961ae65c73bff232ebad0470008a7cee2a03eac1991a
|
| 3 |
+
size 7850069
|
vectorstore/signature.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28f5b7b82e934b18c2341f84c8a85679bb3eac85aed26a6c938ec188f9e616ed
|
| 3 |
+
size 287
|