dariakryvosheieva committed
Commit f733cc1 · verified
1 Parent(s): d28253c

Delete custom_st.py

Files changed (1)
  1. custom_st.py +0 -178
custom_st.py DELETED
@@ -1,178 +0,0 @@
- from typing import List, Dict, Tuple, Union, Any, Optional
-
- import os
- import json
- import torch
-
- from torch import nn
- from transformers import AutoConfig, AutoModel, AutoTokenizer
- from transformers.utils import is_flash_attn_2_available
-
-
- class Transformer(nn.Module):
-     def __init__(
-         self,
-         model_name_or_path: str,
-         max_seq_length: int = None,
-         model_args: Dict[str, Any] = None,
-         tokenizer_args: Dict[str, Any] = None,
-         config_args: Dict[str, Any] = None,
-         cache_dir: str = None,
-         do_lower_case: bool = False,
-         tokenizer_name_or_path: str = None,
-         **kwargs,
-     ) -> None:
-         super().__init__()
-         self.config_keys = ["max_seq_length", "do_lower_case"]
-         self.do_lower_case = do_lower_case
-         if model_args is None:
-             model_args = {}
-         if tokenizer_args is None:
-             tokenizer_args = {}
-         if config_args is None:
-             config_args = {}
-
-         self.config = AutoConfig.from_pretrained(model_name_or_path, **config_args, cache_dir=cache_dir)
-
-         self.task_names = self.config.task_names
-
-         self.default_task = model_args.pop('default_task', None)
-
-         model_args["attn_implementation"] = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"
-
-         self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir, **model_args)
-
-         if max_seq_length is not None and "model_max_length" not in tokenizer_args:
-             tokenizer_args["model_max_length"] = max_seq_length
-         self.tokenizer = AutoTokenizer.from_pretrained(
-             tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path,
-             cache_dir=cache_dir,
-             **tokenizer_args,
-         )
-
-         # No max_seq_length set. Try to infer from model
-         if max_seq_length is None:
-             if (
-                 hasattr(self.auto_model, "config")
-                 and hasattr(self.auto_model.config, "max_position_embeddings")
-                 and hasattr(self.tokenizer, "model_max_length")
-             ):
-                 max_seq_length = min(self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length)
-
-         self.max_seq_length = max_seq_length
-
-         if tokenizer_name_or_path is not None:
-             self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__
-
-
-     @property
-     def default_task(self):
-         return self._default_task
-
-
-     @default_task.setter
-     def default_task(self, task: Union[None, str]):
-         self._validate_task(task)
-         self._default_task = task
-
-
-     def _validate_task(self, task: str):
-         if task and task not in self.task_names:
-             raise ValueError(
-                 f"Unsupported task '{task}'. "
-                 f"Supported tasks are: {', '.join(self.config.task_names)}."
-             )
-
-
-     def forward(
-         self,
-         features: Dict[str, torch.Tensor],
-         task: Optional[str] = None
-     ) -> Dict[str, torch.Tensor]:
-         """
-         Forward pass through the model.
-         """
-         features.pop('prompt_length', None)
-         output_states = self.auto_model.forward(
-             **features,
-             output_attentions=False,
-             return_dict=True
-         )
-         output_tokens = output_states[0]
-         features.update({"token_embeddings": output_tokens, "attention_mask": features["attention_mask"]})
-         return features
-
-
-     def get_word_embedding_dimension(self) -> int:
-         return self.auto_model.config.hidden_size
-
-
-     def tokenize(
-         self,
-         texts: Union[List[str], List[dict], List[Tuple[str, str]]],
-         padding: Union[str, bool] = True
-     ) -> Dict[str, torch.Tensor]:
-         """Tokenizes a text and maps tokens to token-ids"""
-         output = {}
-         if isinstance(texts[0], str):
-             to_tokenize = [texts]
-         elif isinstance(texts[0], dict):
-             to_tokenize = []
-             output["text_keys"] = []
-             for lookup in texts:
-                 text_key, text = next(iter(lookup.items()))
-                 to_tokenize.append(text)
-                 output["text_keys"].append(text_key)
-             to_tokenize = [to_tokenize]
-         else:
-             batch1, batch2 = [], []
-             for text_tuple in texts:
-                 batch1.append(text_tuple[0])
-                 batch2.append(text_tuple[1])
-             to_tokenize = [batch1, batch2]
-
-         # strip
-         to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize]
-
-         # Lowercase
-         if self.do_lower_case:
-             to_tokenize = [[s.lower() for s in col] for col in to_tokenize]
-
-         output.update(
-             self.tokenizer(
-                 *to_tokenize,
-                 padding=padding,
-                 truncation=True,
-                 return_tensors="pt",
-                 max_length=self.max_seq_length,
-             )
-         )
-         return output
-
-
-     def get_config_dict(self) -> Dict[str, Any]:
-         return {key: self.__dict__[key] for key in self.config_keys}
-
-
-     def save(self, output_path: str, safe_serialization: bool = True) -> None:
-         self.auto_model.save_pretrained(output_path, safe_serialization=safe_serialization)
-         self.tokenizer.save_pretrained(output_path)
-
-         with open(os.path.join(output_path, "sentence_transformer_config.json"), "w") as fOut:
-             json.dump(self.get_config_dict(), fOut, indent=2)
-
-
-     @classmethod
-     def load(cls, input_path: str) -> "Transformer":
-         config_name = "sentence_transformer_config.json"
-         stransformer_config_path = os.path.join(input_path, config_name)
-         with open(stransformer_config_path) as fIn:
-             config = json.load(fIn)
-         # Don't allow configs to set trust_remote_code
-         if "model_args" in config and "trust_remote_code" in config["model_args"]:
-             config["model_args"].pop("trust_remote_code")
-         if "tokenizer_args" in config and "trust_remote_code" in config["tokenizer_args"]:
-             config["tokenizer_args"].pop("trust_remote_code")
-         if "config_args" in config and "trust_remote_code" in config["config_args"]:
-             config["config_args"].pop("trust_remote_code")
-         return cls(model_name_or_path=input_path, **config)
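For reference, the deleted class follows the sentence-transformers custom-module interface (tokenize / forward / get_config_dict / save / load). The sketch below is only a hypothetical illustration of how such a module is typically wired up by hand; it is not taken from this repository. It assumes custom_st.py is still available locally, uses a placeholder checkpoint path, and requires a checkpoint whose config defines the task_names field read in __init__.

from sentence_transformers import SentenceTransformer, models

from custom_st import Transformer  # the module deleted in this commit

# Placeholder path, not a real repo id; the checkpoint's config must expose task_names.
word_embedding_model = Transformer("path/to/checkpoint")
pooling = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode="mean",  # pooling choice is an assumption, not specified by custom_st.py
)
model = SentenceTransformer(modules=[word_embedding_model, pooling])

embeddings = model.encode(["A sample sentence."])
print(embeddings.shape)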