gagannarula committed · verified
Commit 32d3fde · 1 Parent(s): 1fe07a9

App-redesign (#1)


- New app design, removing unnecessary code (5c7c0e25f77218ec5162453df6d15fcd71550f82)
- Merge branch 'gagan/redo-design' into pr/1 (1b7da9f7b82d5953b20899553e700fee83290a79)
- Small fix in launch (8ff0489f2c0201c4233e566b85acbd78ccfc632c)

NatureLM/augmentations.py DELETED
@@ -1,349 +0,0 @@
1
- import logging
2
- import random
3
-
4
- import numpy as np
5
- import torch as th
6
- from torch import nn
7
- from torch.nn import functional as F
8
-
9
- from NatureLM.utils import mel_frequencies
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
- class RevEcho(nn.Module):
15
- """
16
- Hacky Reverb but runs on GPU without slowing down training. This reverb adds a
17
- succession of attenuated echos of the input signal to itself. Intuitively, the delay
18
- of the first echo will happen after roughly 2x the radius of the room and is
19
- controlled by `first_delay`. Then RevEcho keeps adding echos with the same delay and
20
- further attenuation until the amplitude ratio between the last and first echo is
21
- 1e-3. The attenuation factor and the number of echos to add are controlled by RT60
22
- (measured in seconds). RT60 is the average time to get to -60dB (n.b. volume is
23
- measured over the squared amplitude so this matches the 1e-3 ratio).
24
-
25
- At each call to RevEcho, `first_delay`, `initial` and `RT60` are sampled from their
26
- range. Then, to prevent this reverb from being too regular, the delay time is
27
- resampled uniformly within `first_delay +/- 10%`, as controlled by the `jitter`
28
- parameter.
29
-
30
- Finally, for a denser reverb, multiple trains of echos are added with different
31
- jitter noises.
32
-
33
- Args:
34
- - initial: amplitude of the first echo as a fraction of the input signal. For
35
- each sample, actually sampled from `[0, initial]`. Larger values mean louder
36
- reverb. Physically, this would depend on the absorption of the room walls.
37
- - rt60: range of values to sample the RT60 in seconds, i.e. after RT60 seconds,
38
- the echo amplitude is 1e-3 of the first echo. The default values follow the
39
- recommendations of https://arxiv.org/ftp/arxiv/papers/2001/2001.08662.pdf,
40
- Section 2.4. Physically this would also be related to the absorption of the
41
- room walls and there is likely a relation between `RT60` and `initial`, which
42
- we ignore here.
43
- - first_delay: range of values to sample the first echo delay in seconds. The
44
- default values are equivalent to sampling a room of 3 to 10 meters.
45
- - repeat: how many trains of echos with different jitters to add. Higher values
46
- mean a denser reverb.
47
- - jitter: jitter used to make each repetition of the reverb echo train slightly
48
- different. For instance a jitter of 0.1 means the delay between two echos will
49
- be in the range `first_delay +- 10%`, with the jittering noise being resampled
50
- after each single echo.
51
- - keep_clean: fraction of the reverb of the clean speech to add back to the
52
- ground truth. 0 = dereverberation, 1 = no dereverberation.
53
- - sample_rate: sample rate of the input signals.
54
- """
55
-
56
- def __init__(
57
- self,
58
- proba=0.5,
59
- initial=0.3,
60
- rt60=(0.3, 1.3),
61
- first_delay=(0.01, 0.03),
62
- repeat=3,
63
- jitter=0.1,
64
- keep_clean=0.1,
65
- sample_rate=16000,
66
- rng=None,
67
- seed=42,
68
- ):
69
- super().__init__()
70
-
71
- self.proba = proba
72
- self.initial = initial
73
- self.rt60 = rt60
74
- self.first_delay = first_delay
75
- self.repeat = repeat
76
- self.jitter = jitter
77
- self.keep_clean = keep_clean
78
- self.sample_rate = sample_rate
79
- self.seed = seed
80
- self.rng = rng if rng is not None else random.Random(self.seed)
81
-
82
- def _reverb(self, source, initial, first_delay, rt60):
83
- """
84
- Return the reverb for a single source.
85
- """
86
- length = source.shape[-1]
87
- reverb = th.zeros_like(source)
88
-
89
- for _ in range(self.repeat):
90
- frac = 1 # what fraction of the first echo amplitude is still here
91
- echo = initial * source
92
- while frac > 1e-3:
93
- # First jitter noise for the delay
94
- jitter = 1 + self.jitter * self.rng.uniform(-1, 1)
95
- delay = min(1 + int(jitter * first_delay * self.sample_rate), length)
96
-
97
- # Delay the echo in time by padding with zero on the left
98
- echo = F.pad(echo[:, :, :-delay], (delay, 0))
99
- reverb += echo
100
-
101
- # Second jitter noise for the attenuation
102
- jitter = 1 + self.jitter * self.rng.uniform(-1, 1)
103
- # we want, with `d` the attenuation, d**(rt60 / first_ms) = 1e-3
104
- # i.e. log10(d) = -3 * first_ms / rt60, so that
105
- attenuation = 10 ** (-3 * jitter * first_delay / rt60)
106
- echo *= attenuation
107
- frac *= attenuation
108
-
109
- return reverb
110
-
111
- def forward(self, samples):
112
- if self.rng.random() >= self.proba:
113
- return samples
114
-
115
- raw_wav = samples.get("raw_wav", None)
116
-
117
- # add channel dimension if not exist
118
- if raw_wav.dim() == 2:
119
- raw_wav = raw_wav.unsqueeze(1)
120
-
121
- # Sample characteristics for the reverb
122
- initial = self.rng.random() * self.initial
123
- first_delay = self.rng.uniform(*self.first_delay)
124
- rt60 = self.rng.uniform(*self.rt60)
125
-
126
- reverb_wav = self._reverb(raw_wav, initial, first_delay, rt60)
127
- raw_wav += self.keep_clean * reverb_wav
128
-
129
- # remove channel dimension
130
- if raw_wav.dim() == 3 and raw_wav.shape[1] == 1:
131
- raw_wav = raw_wav.squeeze(1)
132
-
133
- samples["raw_wav"] = raw_wav
134
- return samples
135
-
136
-
137
- class BandMask(nn.Module):
138
- """
139
- Masks bands of frequencies. Similar to Park, Daniel S., et al.
140
- "Specaugment: A simple data augmentation method for automatic speech recognition."
141
- (https://arxiv.org/pdf/1904.08779.pdf) but over the waveform.
142
- """
143
-
144
- def __init__(self, maxwidth=0.2, bands=120, sample_rate=16_000, rng=None, seed=42):
145
- """__init__.
146
-
147
- :param maxwidth: the maximum width to remove
148
- :param bands: number of bands
149
- :param sample_rate: signal sample rate
150
- """
151
- super().__init__()
152
- self.maxwidth = maxwidth
153
- self.bands = bands
154
- self.sample_rate = sample_rate
155
- self.seed = seed
156
- self.rng = rng if rng is not None else random.Random(self.seed)
157
-
158
- def forward(self, samples):
159
- raw_wav = samples.get("raw_wav", None)
160
-
161
- # add channel dimension if not exist
162
- if raw_wav.dim() == 2:
163
- raw_wav = raw_wav.unsqueeze(1)
164
-
165
- bands = self.bands
166
- bandwidth = int(abs(self.maxwidth) * bands)
167
- mels = mel_frequencies(bands, 40, self.sample_rate / 2) / self.sample_rate
168
- low = self.rng.randrange(bands)
169
- high = self.rng.randrange(low, min(bands, low + bandwidth))
170
-
171
- filters = LowPassFilters([mels[low], mels[high]]).to(raw_wav.device)
172
-
173
- low, midlow = filters(raw_wav)
174
- # band pass filtering
175
- out = raw_wav - midlow + low
176
-
177
- # remove channel dimension
178
- if out.dim() == 3 and out.shape[1] == 1:
179
- out = out.squeeze(1)
180
-
181
- samples["raw_wav"] = out
182
- return samples
183
-
184
-
185
- class Shift(nn.Module):
186
- def __init__(self, shift=8192, same=False, rngth=None):
187
- """
188
- :param shift: randomly shifts the signals up to a given factor
189
- :param same: shifts both clean and noisy files by the same factor
190
- """
191
- super().__init__()
192
- self.shift = shift
193
- self.same = same
194
- self.rngth = rngth
195
-
196
- def forward(self, samples):
197
- raw_wav = samples.get("raw_wav", None)
198
- batch, channels, length = raw_wav.shape
199
- length = length - self.shift
200
- if self.shift > 0:
201
- offsets = th.randint(
202
- self.shift, [1 if self.same else batch, 1, 1], device=raw_wav.device, generator=self.rngth
203
- )
204
- offsets = offsets.expand(-1, channels, -1)
205
- indexes = th.arange(length, device=raw_wav.device)
206
- import pdb
207
-
208
- pdb.set_trace()
209
- raw_wav = raw_wav.gather(2, indexes + offsets)
210
- samples["raw_wav"] = raw_wav
211
- return samples
212
-
213
-
214
- class TimeScale(nn.Module):
215
- """Fast time scale."""
216
-
217
- def __init__(self, scale=2.0, target=1, rngnp=None, seed=42):
218
- """
219
- :param scale: randomly scales up to this maximum factor
220
- """
221
- super().__init__()
222
- self.scale = scale
223
- self.target = target
224
- self.seed = seed
225
- self.rngnp = rngnp if rngnp is not None else np.random.default_rng(seed=self.seed)
226
-
227
- def forward(self, samples):
228
- try:
229
- raw_wav = samples.get("raw_wav")
230
- except KeyError:
231
- logger.error("Missing required key 'raw_wav' in samples dict")
232
- raise
233
-
234
- if "padding_mask" in samples:
235
- masks = samples.get("padding_mask")
236
- else:
237
- masks = th.ones_like(raw_wav)
238
-
239
- # add channel dimension if not exist
240
- if raw_wav.dim() == 2:
241
- raw_wav = raw_wav.unsqueeze(1)
242
- masks = masks.unsqueeze(1)
243
-
244
- # what to augment: noise, clean, or both
245
- if self.target == -1:
246
- targets = [i for i in range(raw_wav.shape[0])]
247
- else:
248
- targets = [self.target]
249
-
250
- for t in targets:
251
- signal = raw_wav[t]
252
- scaling = np.power(self.scale, self.rngnp.uniform(-1, 1))
253
- output_size = int(signal.shape[-1] * scaling)
254
- ref = th.arange(output_size, device=signal.device, dtype=signal.dtype).div_(scaling)
255
-
256
- ref1 = ref.clone().type(th.int64)
257
- ref2 = th.min(ref1 + 1, th.full_like(ref1, signal.shape[-1] - 1, dtype=th.int64))
258
- r = ref - ref1.type(ref.type())
259
- scaled_signal = signal[..., ref1] * (1 - r) + signal[..., ref2] * r
260
- scaled_masks = masks[t][..., ref1] * (1 - r) + masks[t][..., ref2] * r
261
-
262
- # trim or zero pad to the original size
263
- if scaled_signal.shape[-1] > signal.shape[-1]:
264
- nframes_offset = (scaled_signal.shape[-1] - signal.shape[-1]) // 2
265
- scaled_signal = scaled_signal[..., nframes_offset : nframes_offset + signal.shape[-1]]
266
- scaled_masks = scaled_masks[..., nframes_offset : nframes_offset + signal.shape[-1]]
267
- else:
268
- nframes_diff = signal.shape[-1] - scaled_signal.shape[-1]
269
- pad_left = int(np.random.uniform() * nframes_diff)
270
- pad_right = nframes_diff - pad_left
271
- scaled_signal = F.pad(
272
- input=scaled_signal, pad=(pad_left, pad_right, 0, 0, 0, 0), mode="constant", value=0
273
- )
274
- scaled_masks = F.pad(
275
- input=scaled_masks, pad=(pad_left, pad_right, 0, 0, 0, 0), mode="constant", value=0
276
- )
277
- raw_wav[t] = scaled_signal
278
- masks[t] = scaled_masks
279
-
280
- # remove channel dimension
281
- if raw_wav.dim() == 3 and raw_wav.shape[1] == 1:
282
- raw_wav = raw_wav.squeeze(1)
283
- masks = masks.squeeze(1)
284
-
285
- samples["raw_wav"] = raw_wav
286
- samples["padding_mask"] = masks
287
-
288
- return samples
289
-
290
-
291
- class Flip(nn.Module):
292
- def __init__(self, p=0.0, rngth=None):
293
- super(Flip, self).__init__()
294
-
295
- self.p = p
296
- self.rngth = rngth
297
-
298
- def forward(self, samples):
299
- raw_wav = samples["raw_wav"]
300
- if raw_wav.dim() > 2:
301
- flip_mask = th.rand(raw_wav.shape[0], device=raw_wav.device, generator=self.rngth) <= self.p
302
- raw_wav[flip_mask] = raw_wav[flip_mask].flip(-1)
303
- else:
304
- if th.rand(1, generator=self.rngth) <= self.p:
305
- raw_wav = raw_wav.flip(0)
306
- samples["raw_wav"] = raw_wav
307
- return samples
308
-
309
-
310
- class LowPassFilters(th.nn.Module):
311
- """
312
- Bank of low pass filters.
313
-
314
- Args:
315
- cutoffs (list[float]): list of cutoff frequencies, in [0, 1] expressed as `f/f_s` where
316
- f_s is the samplerate.
317
- width (int | None): width of the filters (i.e. kernel_size=2 * width + 1).
318
- Default to `2 / min(cutoffs)`. Longer filters will have better attenuation
319
- but more side effects.
320
- Shape:
321
- - Input: `(*, T)`
322
- - Output: `(F, *, T)` with `F` the length of `cutoffs`.
323
- """
324
-
325
- def __init__(self, cutoffs: list, width: int | None = None):
326
- super().__init__()
327
-
328
- self.cutoffs = cutoffs
329
-
330
- if not width:
331
- width = int(2 / min(cutoffs))
332
- self.width = width
333
-
334
- window = th.hamming_window(2 * width + 1, periodic=False)
335
- t = np.arange(-width, width + 1, dtype=np.float32)
336
- filters = []
337
- for cutoff in cutoffs:
338
- sinc = th.from_numpy(np.sinc(2 * cutoff * t))
339
- filters.append(2 * cutoff * sinc * window)
340
- self.register_buffer("filters", th.stack(filters).unsqueeze(1))
341
-
342
- def forward(self, input):
343
- *others, t = input.shape
344
- input = input.view(-1, 1, t)
345
- out = F.conv1d(input, self.filters, padding=self.width)
346
- return out.permute(1, 0, 2).reshape(-1, *others, t)
347
-
348
- def __repr__(self):
349
- return "LossPassFilters(width={},cutoffs={})".format(self.width, self.cutoffs)
NatureLM/checkpoint_utils.py CHANGED
@@ -42,27 +42,6 @@ def get_state_dict(model, drop_untrained_params: bool = True) -> dict[str, Any]:
42
  return state_dict
43
 
44
 
45
- def torch_save_to_bucket(save_obj: Any, save_path: Union[str, os.PathLike], compress: bool = True) -> None:
46
- """Save an object directly to GCS bucket without intermediate disk storage.
47
-
48
- Args:
49
- save_obj: Object to save (usually model state dict or checkpoint)
50
- save_path: Path to save in GCS bucket (must be gs:// path)
51
- compress: Whether to use compression. Default: True
52
- """
53
- if not is_gcs_path(save_path):
54
- raise ValueError("save_path must be a GCS path")
55
-
56
- # save to a temporary local file and then upload to GCS
57
- with tempfile.NamedTemporaryFile() as tmp:
58
- torch.save(save_obj, tmp.name, _use_new_zipfile_serialization=compress)
59
- try:
60
- save_path.upload_from(tmp.name)
61
- except Exception as e:
62
- logger.error(f"Error saving to GCP bucket: {e}")
63
- raise e
64
-
65
-
66
  def save_model_checkpoint(
67
  model: nn.Module,
68
  save_path: Union[str, os.PathLike],
@@ -82,7 +61,7 @@ def save_model_checkpoint(
82
  extention (str): Extension to use for the checkpoint file. Default: "pth".
83
  **objects_to_save: Additional objects to save, e.g. optimizer state dict, etc.
84
  """
85
- if not is_gcs_path(save_path) and not os.path.exists(os.path.dirname(save_path)):
86
  raise FileNotFoundError(f"Directory {os.path.dirname(save_path)} does not exist.")
87
 
88
  model_no_ddp = maybe_unwrap_dist_model(model, use_distributed)
@@ -93,8 +72,4 @@ def save_model_checkpoint(
93
  }
94
 
95
  logger.info("Saving checkpoint to {}.".format(save_path))
96
-
97
- if is_gcs_path(save_path):
98
- torch_save_to_bucket(save_obj, save_path)
99
- else:
100
- torch.save(save_obj, save_path)
 
42
  return state_dict
43
 
44
 
 
45
  def save_model_checkpoint(
46
  model: nn.Module,
47
  save_path: Union[str, os.PathLike],
 
61
  extention (str): Extension to use for the checkpoint file. Default: "pth".
62
  **objects_to_save: Additional objects to save, e.g. optimizer state dict, etc.
63
  """
64
+ if not os.path.exists(os.path.dirname(save_path)):
65
  raise FileNotFoundError(f"Directory {os.path.dirname(save_path)} does not exist.")
66
 
67
  model_no_ddp = maybe_unwrap_dist_model(model, use_distributed)
 
72
  }
73
 
74
  logger.info("Saving checkpoint to {}.".format(save_path))
75
+ torch.save(save_obj, save_path)
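
With the GCS branch removed, save_model_checkpoint now always writes through torch.save to a local path. A minimal local-only sketch of the retained flow (the helper name and the extra keyword argument are illustrative, not part of the module):

import os
import torch
from torch import nn

def save_local_checkpoint(model: nn.Module, save_path: str, **objects_to_save) -> None:
    # Mirrors the retained behaviour: the target directory must already exist,
    # and the checkpoint goes straight to disk with torch.save.
    if not os.path.exists(os.path.dirname(save_path)):
        raise FileNotFoundError(f"Directory {os.path.dirname(save_path)} does not exist.")
    save_obj = {"model": model.state_dict(), **objects_to_save}
    torch.save(save_obj, save_path)

save_local_checkpoint(nn.Linear(4, 2), "./checkpoint_3.pth", epoch=3)
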
 
 
 
 
NatureLM/dataset.py DELETED
@@ -1,550 +0,0 @@
1
- # Copyright (2024) Earth Species Project
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
-
16
- """
17
- Mixing examples.
18
- Can mix:
19
- - base: options-detection add: open-ended:
20
- Take all open-ended labels. Add them to the options. Add them to the labels.
21
- - base: open-ended, add: open-ended
22
- Concatenate labels
23
- """
24
-
25
- import glob
26
- import json
27
- import os
28
- import random
29
- from collections import defaultdict
30
- from pathlib import Path
31
- from typing import Literal
32
-
33
- import numpy as np
34
- import soundfile as sf
35
- import torch
36
- from torch.nn.utils.rnn import pad_sequence
37
- from torch.utils.data import Dataset
38
-
39
- from NatureLM.utils import snr_scale, time_scale
40
-
41
-
42
- def write_example_to_file(base_filename, audio, sr=16000, suffix="_output", save_dir="debug_outputs"):
43
- """
44
- Writes the audio tensor to a file for debugging or inspection purposes.
45
-
46
- Args:
47
- base_filename (str): The base name of the original file.
48
- audio (torch.Tensor or numpy.ndarray): The audio waveform to save.
49
- sr (int): Sampling rate of the audio (default: 16000 Hz).
50
- suffix (str): Optional suffix to append to the filename.
51
- save_dir (str): Directory where the files will be saved.
52
- """
53
- if isinstance(audio, torch.Tensor):
54
- audio = audio.numpy() # Convert to numpy if necessary
55
-
56
- # Ensure the save directory exists
57
- os.makedirs(save_dir, exist_ok=True)
58
-
59
- # Create the output file path
60
- filename = f"{os.path.splitext(base_filename)[0]}{suffix}.wav"
61
- output_path = os.path.join(save_dir, filename)
62
-
63
- try:
64
- # Write the audio to the file
65
- sf.write(output_path, audio, sr)
66
- print(f"Saved audio to {output_path}")
67
- except Exception as e:
68
- print(f"Failed to write audio to file: {e}")
69
-
70
-
71
- # Example usage in your code
72
- # write_example_to_file(os.path.basename(ann["path"]), audio, suffix="_ts")
73
-
74
-
75
- def collater(samples):
76
- """Collate samples into a batch.
77
-
78
- Samples is a list of dictionaries, each containing the following keys:
79
- - raw_wav: a list of tensors containing the raw audio waveform
80
- - text: a list of strings containing the text
81
- - task: a list of strings containing the task
82
- - id: a list of strings containing the id
83
- - prompt: a list of strings containing the prompt
84
- - index: a list of integers containing the index
85
-
86
- The individual audio waveforms will be stacked along the batch dimension for easier
87
- processing in the audio model. To keep which audio belongs to which sample, we add
88
- the audio_chunk_sizes key to the batch dictionary.
89
- """
90
- flat_raw_wav = []
91
- audio_chunk_sizes = []
92
-
93
- for s in samples:
94
- chunk_size = len(s["raw_wav"])
95
- audio_chunk_sizes.append(chunk_size)
96
- flat_raw_wav.extend(s["raw_wav"])
97
- # raw_wav = [torch.from_numpy(a) for a in flat_raw_wav]
98
- raw_wav = flat_raw_wav
99
- raw_wav_length = torch.tensor([len(a) for a in raw_wav])
100
- raw_wav = pad_sequence(raw_wav, batch_first=True, padding_value=0)
101
- paddding_mask = torch.arange(raw_wav.size(1)).unsqueeze(0) >= raw_wav_length.unsqueeze(1)
102
-
103
- text = [s["text"] for s in samples]
104
- prompt = [s["prompt"] for s in samples]
105
- task = [s["task"] for s in samples]
106
- id = [s["id"] for s in samples]
107
- index = [s["index"] for s in samples]
108
-
109
- return {
110
- "raw_wav": raw_wav,
111
- "padding_mask": paddding_mask,
112
- "text": text,
113
- "task": task,
114
- "id": id,
115
- "prompt": prompt,
116
- "index": index,
117
- "audio_chunk_sizes": audio_chunk_sizes,
118
- }
119
-
120
-
121
- class NatureLMDataset(Dataset):
122
- def __init__(
123
- self,
124
- ann_path: str | Path,
125
- *,
126
- max_length_seconds: int = 10,
127
- cropping: Literal["random", "start"] | None = "random",
128
- noise_prob: float = 0.0,
129
- noise_dirs: list[str] | list[Path] | None = None,
130
- low_snr: float = -5,
131
- high_snr: float = 20,
132
- time_scale_prob: float = 0.0,
133
- time_scale: float = 1.2,
134
- seed: int = 0,
135
- mixup_prob: float = 0.0,
136
- mixup_count: int = 3,
137
- use_augmentation: bool = False,
138
- mask_audio_prob: float = 0.0,
139
- ):
140
- super().__init__()
141
-
142
- ann_path = Path(ann_path)
143
-
144
- if not ann_path.exists():
145
- raise FileNotFoundError(f"Dataset file {ann_path} not found")
146
-
147
- try:
148
- with open(ann_path, "r") as f:
149
- data = json.load(f)
150
- self.annotation = data["annotation"]
151
- except (json.JSONDecodeError, KeyError):
152
- with open(ann_path, "r") as f:
153
- self.annotation = [json.loads(line) for line in f]
154
-
155
- #### mixup related variables
156
- ### hash table for tasks to sample the tasks faster
157
- self.tasks = defaultdict(list)
158
- for i, ann in enumerate(self.annotation):
159
- if "task" in ann and "text" in ann and ann["text"] != "None" and "path" in ann:
160
- self.tasks[ann["task"]].append(i)
161
-
162
- self.mixup_tasks = {
163
- task: []
164
- for task in self.tasks.keys()
165
- if task.endswith("simple-detection")
166
- or task.endswith("multiple-detection") # Add more tasks after validating prompt mixing.
167
- or task.endswith("sci-detection-random")
168
- or task.endswith("common-detection-random")
169
- }
170
- for k in self.mixup_tasks.keys():
171
- # whichever the base, only mix open-ended tasks.
172
- if "sci-" in k:
173
- self.mixup_tasks[k] = [
174
- task
175
- for task in self.mixup_tasks.keys()
176
- if task.endswith("sci-simple-detection") or task.endswith("sci-multiple-detection")
177
- ]
178
- elif "common-" in k:
179
- self.mixup_tasks[k] = [
180
- task
181
- for task in self.mixup_tasks.keys()
182
- if task.endswith("common-simple-detection") or task.endswith("common-multiple-detection")
183
- ]
184
- else:
185
- self.mixup_tasks[k] = [task for task in self.mixup_tasks.keys() if "common-" in task]
186
-
187
- # print("num annotations", len(self.annotation))
188
- # print("annotation 0", self.annotation[0])
189
- # self.annotation = [a for a in self.annotation if "task" in a and "detection" not in a["task"]] # no detection... :(
190
- self.max_length_seconds = max_length_seconds
191
- self.cropping = cropping
192
- self.use_augmentation = use_augmentation
193
-
194
- ### noise augmentation
195
- self.rng = random.Random(seed)
196
- self.rngnp = np.random.default_rng(seed=seed)
197
- self.noise_dirs = noise_dirs
198
- self.noise_prob = noise_prob
199
- self.noise_files = []
200
- self.low_snr = low_snr
201
- self.high_snr = high_snr
202
- self.mask_audio_prob = mask_audio_prob
203
- if noise_dirs is not None and len(self.noise_dirs) > 0 and self.use_augmentation:
204
- for noise_dir in noise_dirs:
205
- noise_from_dir = glob.glob(os.path.join(noise_dir, "*.wav"))
206
- if len(noise_from_dir) < 3000:
207
- noise_from_dir = noise_from_dir * 3
208
- print("noise files from dir", noise_dir, len(noise_from_dir))
209
- self.noise_files.extend(noise_from_dir)
210
-
211
- ### mixup augmentation
212
- self.mixup_prob = mixup_prob
213
- self.mixup_count = mixup_count
214
- # ### time scale augmentation
215
- self.time_scale = time_scale
216
- self.time_scale_prob = time_scale_prob
217
- # tasks = set([annotation["task"] if "task" in annotation else "empty" for annotation in self.annotation])
218
- print(":::all tasks:::", self.tasks.keys())
219
- print("num examples", len(self.annotation))
220
-
221
- def __len__(self):
222
- return len(self.annotation)
223
-
224
- def collater(self, samples):
225
- return collater(samples)
226
-
227
- def load_audio(self, audio_path, shift_allowed: bool, noise_allowed: bool):
228
- audio, sr = sf.read(audio_path)
229
- # assert sr == 16000
230
- if sr != 16000:
231
- print("other sr!", sr, audio_path)
232
- if len(audio.shape) == 2: # stereo to mono
233
- audio = audio.mean(axis=1)
234
-
235
- ### time scale augmentation
236
- if self.use_augmentation and self.rng.random() < self.time_scale_prob and self.time_scale > 0 and shift_allowed:
237
- # print(f"{index} scaling audio")
238
- # write_example_to_file(os.path.basename(ann["path"]), audio[: sr * self.max_length_seconds] )
239
- audio = time_scale(torch.tensor(audio), scale=self.time_scale, rngnp=self.rngnp).numpy()
240
- # write_example_to_file(os.path.basename(ann["path"]), audio[: sr * self.max_length_seconds] , suffix='_ts')
241
-
242
- # Randomly crop a max_length_seconds window if audio is longer than 10 seconds
243
- if len(audio) > sr * self.max_length_seconds and self.cropping == "random":
244
- max_start = len(audio) - sr * self.max_length_seconds
245
- start = random.randint(0, max_start)
246
- audio = audio[start : start + sr * self.max_length_seconds]
247
- else: # no random cropping
248
- audio = audio[: sr * self.max_length_seconds] # Truncate audio to at most max_length_seconds
249
-
250
- ### noise augmentation
251
- audio = torch.tensor(audio)
252
- ### noise augmentation
253
- if (
254
- self.use_augmentation
255
- and self.rng.random() < self.noise_prob
256
- and len(self.noise_files) > 0
257
- and noise_allowed
258
- ):
259
- # write_example_to_file(os.path.basename(ann["path"]), audio)
260
- # print(f"{index} adding noise")
261
- noise_file = self.rng.choice(self.noise_files)
262
- if not os.path.exists(noise_file):
263
- print(f"Warning: noise file {noise_file} does not exist")
264
- else:
265
- noise_audio, noise_sr = sf.read(noise_file)
266
- assert noise_sr == 16000
267
- if len(noise_audio.shape) == 2:
268
- noise_audio = noise_audio.mean(axis=1)
269
-
270
- noise_audio = torch.tensor(noise_audio)
271
-
272
- ### repeat or trim to the audio size
273
- if len(audio) > len(noise_audio):
274
- if len(noise_audio) == 0:
275
- print(
276
- "----- Warning: Noise audio length is zero. ---------- ",
277
- noise_file,
278
- )
279
- # Option 1: Skip noise augmentation by setting noise_audio to zero
280
- noise_audio = torch.zeros_like(audio)
281
- else:
282
- nrepeats = int(np.maximum(2, np.ceil(len(audio) / len(noise_audio))))
283
- noise_audio = noise_audio.repeat(nrepeats)
284
- ### Randomly crop the noise file if it is too long
285
- if len(noise_audio) > len(audio):
286
- max_start = len(noise_audio) - len(audio)
287
- start = random.randint(0, max_start)
288
- noise_audio = noise_audio[start : start + len(audio)]
289
-
290
- ### remix with specified snr
291
- snr = self.rngnp.uniform(self.low_snr, self.high_snr)
292
- snr = torch.tensor([snr])
293
- noise_audio = snr_scale(audio, noise_audio, snr)
294
- audio = audio + noise_audio
295
-
296
- # write_example_to_file(os.path.basename(audio_path), audio, suffix='_noise')
297
- if len(audio) > self.max_length_seconds * sr:
298
- print("long audio", len(audio), len(noise_audio))
299
- audio = audio[: self.max_length_seconds * sr]
300
-
301
- # pad all audios to max_len_seconds in _getitem_ to ensure no padding inconsistencies.
302
- if len(audio) < sr * self.max_length_seconds:
303
- pad_size = sr * self.max_length_seconds - len(audio)
304
- audio = torch.nn.functional.pad(audio, (0, pad_size))
305
-
306
- audio = torch.clamp(audio, -1.0, 1.0)
307
-
308
- return audio
309
-
310
- def _mix_labels(self, text, text_to_mix):
311
- """
312
- Given two comma-separated label strings (e.g., "gorilla, zebra"),
313
- combine them without introducing duplicates. If either is "None",
314
- return the other as-is (unless both are "None").
315
- """
316
- # If `text_to_mix` is explicitly "None", just return `text`.
317
- if text_to_mix == "None":
318
- return text
319
-
320
- # If `text` is explicitly "None", just return `text_to_mix`.
321
- if text == "None":
322
- return text_to_mix
323
-
324
- # Split both strings by comma, stripping whitespace
325
- text_list = [item.strip() for item in text.split(",") if item.strip()]
326
- text_to_mix_list = [item.strip() for item in text_to_mix.split(",") if item.strip()]
327
-
328
- # Deduplicate: add only new items from text_to_mix_list
329
- combined_set = set(text_list)
330
- for item in text_to_mix_list:
331
- if item not in combined_set:
332
- text_list.append(item)
333
- combined_set.add(item)
334
-
335
- # If there's nothing left after deduplication, return "None".
336
- if not text_list:
337
- return "None"
338
-
339
- # Rejoin them into a comma-separated string
340
- return ", ".join(text_list)
341
-
342
- def _mix_prompts(self, text, text_to_mix, prompt):
343
- """
344
- If the prompt is in the form:
345
- "Which of these, if any, are present in the audio recording? option1, option2, ..."
346
-
347
- 1. Parse out the question (before '?') and the list of prompt choices (after '?').
348
- 2. Convert both `text` and `text_to_mix` into lists, checking for items not in the prompt.
349
- 3. Append any missing answers to the prompt choices.
350
- 4. Shuffle the choices.
351
- 5. Reassemble and return the new prompt.
352
-
353
- If the prompt does not follow the expected structure, it is returned unmodified.
354
- """
355
- # Split into two parts: question + choices
356
- splitted = prompt.split("?")
357
- if len(splitted) != 2:
358
- # If we don't have exactly one question mark segment, just return the original prompt
359
- return prompt
360
-
361
- question = splitted[0].strip()
362
- potential_choices_str = splitted[1].strip()
363
-
364
- # Split the prompt choices
365
- if not potential_choices_str:
366
- prompt_choices = []
367
- else:
368
- prompt_choices = [c.strip() for c in potential_choices_str.split(",") if c.strip()]
369
-
370
- # Parse `text`
371
- text_list = [item.strip() for item in text.split(",") if item.strip()]
372
-
373
- # Parse `text_to_mix`
374
- text_to_mix_list = [item.strip() for item in text_to_mix.split(",") if item.strip()]
375
-
376
- # Add any new items from text_list to the prompt
377
- for item in text_list:
378
- if item not in prompt_choices:
379
- prompt_choices.append(item)
380
-
381
- # Add any new items from text_to_mix_list to the prompt
382
- for item in text_to_mix_list:
383
- if item not in prompt_choices:
384
- prompt_choices.append(item)
385
-
386
- # Shuffle consistently with self.rng
387
- self.rng.shuffle(prompt_choices)
388
-
389
- # Reassemble
390
- new_prompt = question + "? " + ", ".join(prompt_choices)
391
- return new_prompt
392
-
393
- def _apply_mixup(self, prompt, audio, text, task, filename=None):
394
- # mixup_applied = False
395
- if (
396
- self.use_augmentation and self.rng.random() < self.mixup_prob and task in self.mixup_tasks
397
- # and text != "None" # Allow complex 'None' examples.
398
- ):
399
- # write_example_to_file(os.path.basename(ann["path"]), audio)
400
- # print(f"{index} mixing up")
401
- mixup_indices = []
402
- for pair_task in self.mixup_tasks[task]:
403
- mixup_indices.extend(self.tasks[pair_task])
404
- # mixup_indices = mixup_indices.remove(index)
405
-
406
- if len(mixup_indices) == 0:
407
- print("No mixup partner found")
408
- else:
409
- ### choose n_mixup random partners
410
- n_mixup = self.rng.randint(1, self.mixup_count)
411
- mixup_indices = self.rng.sample(mixup_indices, n_mixup)
412
- # print(f"Mixing up with indices {mixup_indices}")
413
- for mixup_index in mixup_indices:
414
- mixup_ann = self.annotation[mixup_index]
415
- mixup_audio, _ = sf.read(mixup_ann["path"])
416
- if len(mixup_audio.shape) == 2:
417
- mixup_audio = mixup_audio.mean(axis=1)
418
- mixup_audio = mixup_audio[: len(audio)]
419
- if len(mixup_audio) < len(audio):
420
- pad_size = len(audio) - len(mixup_audio)
421
- mixup_audio = np.pad(mixup_audio, (0, pad_size), mode="constant")
422
- mixup_audio = torch.from_numpy(mixup_audio).float()
423
- lam = np.clip(self.rngnp.beta(1.0, 1.0), 0.1, 0.8)
424
-
425
- # Mix the raw_wav
426
- audio = lam * audio + (1 - lam) * mixup_audio
427
-
428
- ### Mix the prompts if the labels are given in prompts
429
- if text in prompt:
430
- prompt = self._mix_prompts(text, mixup_ann["text"], prompt)
431
-
432
- ### Mix the labels
433
- text = self._mix_labels(text, mixup_ann["text"])
434
-
435
- # mixup_applied = True
436
-
437
- # DEBUG: If mixup was actually applied, save the final audio
438
- # if mixup_applied and filename is not None:
439
- # # Just add a suffix to the original filename to indicate mixup
440
- # base_filename = os.path.basename(filename)
441
- # write_example_to_file(
442
- # base_filename=base_filename,
443
- # audio=audio,
444
- # sr=16000,
445
- # suffix="_mixup",
446
- # save_dir="mixup_outputs"
447
- # )
448
- # print(f"mixup for {filename}::: prompt {prompt} label {text}")
449
-
450
- return prompt, audio, text
451
-
452
- def _load_noise(self, shift_allowed: bool):
453
- noise_file = self.rng.choice(self.noise_files)
454
- noise_audio, noise_sr = sf.read(noise_file)
455
- assert noise_sr == 16000, f"Expected noise sample rate 16000, got {noise_sr}"
456
- if len(noise_audio.shape) == 2:
457
- noise_audio = noise_audio.mean(axis=1)
458
-
459
- # Time scale augmentation if applicable
460
- if self.use_augmentation and self.rng.random() < self.time_scale_prob and self.time_scale > 0 and shift_allowed:
461
- noise_audio = time_scale(torch.tensor(noise_audio), scale=self.time_scale, rngnp=self.rngnp).numpy()
462
-
463
- # Randomly crop or pad to match max_length_seconds
464
- if len(noise_audio) > self.max_length_seconds * 16000 and self.cropping == "random":
465
- max_start = len(noise_audio) - self.max_length_seconds * 16000
466
- start = random.randint(0, max_start)
467
- noise_audio = noise_audio[start : start + self.max_length_seconds * 16000]
468
- else:
469
- noise_audio = noise_audio[: self.max_length_seconds * 16000]
470
-
471
- # Pad if needed
472
- if len(noise_audio) < self.max_length_seconds * 16000:
473
- pad_size = self.max_length_seconds * 16000 - len(noise_audio)
474
- noise_audio = np.pad(noise_audio, (0, pad_size), mode="constant")
475
-
476
- noise_audio = torch.tensor(noise_audio).float()
477
- noise_audio = torch.clamp(noise_audio, -1.0, 1.0)
478
- return noise_audio
479
-
480
- def __getitem__(self, index):
481
- ann = self.annotation[index]
482
- # print("loading audio::", ann)
483
- shift_allowed = "pitch" not in ann.get("task", "")
484
- noise_allowed = (
485
- "/A/" not in ann.get("path", "")
486
- and "-qa" not in ann.get("task", "")
487
- and "icl" not in ann.get("task", "")
488
- and "caption" not in ann.get("task", "")
489
- and "animal-instructions" not in ann.get("task", "")
490
- )
491
-
492
- task = ann.get("task", "asr")
493
- text = ann["text"]
494
- prompt = ann["prompt"]
495
-
496
- replace_with_noise = (
497
- self.use_augmentation
498
- and task.endswith("detection")
499
- and self.rng.random() < self.mask_audio_prob
500
- and len(self.noise_files) > 0
501
- )
502
-
503
- if replace_with_noise:
504
- # Replace audio with noise
505
- audio = self._load_noise(shift_allowed)
506
- audios = [audio]
507
- text = "None"
508
-
509
- else:
510
- if "path" in ann and ann["path"] is not None:
511
- audio = self.load_audio(ann["path"], shift_allowed, noise_allowed)
512
- audios = [audio]
513
- else:
514
- audios = [self.load_audio(p, shift_allowed, noise_allowed) for p in ann["files"]]
515
-
516
- if len(audios) == 1:
517
- prompt, mixed_audio, text = self._apply_mixup(prompt, audio, text, task, filename=ann["path"])
518
- audios = [mixed_audio]
519
-
520
- return {
521
- "raw_wav": audios,
522
- "text": text,
523
- "task": task,
524
- "id": ann.get("path") or ";".join(ann["files"]),
525
- "prompt": prompt,
526
- "index": index, # track which element for eval output
527
- "ann": ann, # Include annotation for mixup
528
- }
529
-
530
-
531
- if __name__ == "__main__":
532
- dataset = NatureLMDataset(
533
- ann_path="/home/ubuntu/foundation-model-storage/foundation-model-data/data/compiled-datasets/v1/s2_eval_valid.jsonl",
534
- noise_dirs=["resource/audio_demo"],
535
- max_length_seconds=10,
536
- use_augmentation=True,
537
- mixup_prob=1.0, # For demonstration, force mixup if possible
538
- mixup_count=2, # Up to 2 mixup partners
539
- mask_audio_prob=0.2,
540
- seed=42,
541
- noise_prob=0.5,
542
- )
543
-
544
- # Process just a few to see the saved mixups
545
- for i in range(300):
546
- sample = dataset[i]
547
- # print("Final text:", sample["text"])
548
- # print("Final prompt:", sample["prompt"])
549
- # print("-" * 40)
550
- print("Done! Look in 'debug_outputs' folder for saved mixup files.")
NatureLM/dist_utils.py DELETED
@@ -1,109 +0,0 @@
1
- """
2
- Adapted from salesforce@LAVIS. Below is the original copyright:
3
- Copyright (c) 2022, salesforce.com, inc.
4
- All rights reserved.
5
- SPDX-License-Identifier: BSD-3-Clause
6
- For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
7
- """
8
-
9
- import datetime
10
- import functools
11
- import os
12
-
13
- import torch
14
- import torch.distributed as dist
15
-
16
-
17
- def setup_for_distributed(is_master):
18
- """
19
- This function disables printing when not in master process
20
- """
21
- import builtins as __builtin__
22
-
23
- builtin_print = __builtin__.print
24
-
25
- def print(*args, **kwargs):
26
- force = kwargs.pop("force", False)
27
- if is_master or force:
28
- builtin_print(*args, **kwargs)
29
-
30
- __builtin__.print = print
31
-
32
-
33
- def is_dist_avail_and_initialized():
34
- if not dist.is_available():
35
- return False
36
- if not dist.is_initialized():
37
- return False
38
- return True
39
-
40
-
41
- def get_world_size():
42
- if not is_dist_avail_and_initialized():
43
- return 1
44
- return dist.get_world_size()
45
-
46
-
47
- def get_rank():
48
- if not is_dist_avail_and_initialized():
49
- return 0
50
- return dist.get_rank()
51
-
52
-
53
- def is_main_process():
54
- return get_rank() == 0
55
-
56
-
57
- def init_distributed_mode(args):
58
- if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
59
- args.rank = int(os.environ["RANK"])
60
- args.world_size = int(os.environ["WORLD_SIZE"])
61
- args.gpu = int(os.environ["LOCAL_RANK"])
62
- elif "SLURM_PROCID" in os.environ:
63
- args.rank = int(os.environ["SLURM_PROCID"])
64
- args.gpu = args.rank % torch.cuda.device_count()
65
- else:
66
- print("Not using distributed mode")
67
- args.use_distributed = False
68
- return
69
-
70
- args.use_distributed = True
71
-
72
- torch.cuda.set_device(args.gpu)
73
- print(
74
- "| distributed init (rank {}, world {}): {}".format(args.rank, args.world_size, args.dist_url),
75
- flush=True,
76
- )
77
- torch.distributed.init_process_group(
78
- backend=args.dist_backend,
79
- init_method=args.dist_url,
80
- world_size=args.world_size,
81
- rank=args.rank,
82
- timeout=datetime.timedelta(days=365), # allow auto-downloading and de-compressing
83
- )
84
- torch.distributed.barrier()
85
- setup_for_distributed(args.rank == 0)
86
-
87
-
88
- def get_dist_info():
89
- if torch.__version__ < "1.0":
90
- initialized = dist._initialized
91
- else:
92
- initialized = dist.is_initialized()
93
- if initialized:
94
- rank = dist.get_rank()
95
- world_size = dist.get_world_size()
96
- else: # non-distributed training
97
- rank = 0
98
- world_size = 1
99
- return rank, world_size
100
-
101
-
102
- def main_process(func):
103
- @functools.wraps(func)
104
- def wrapper(*args, **kwargs):
105
- rank, _ = get_dist_info()
106
- if rank == 0:
107
- return func(*args, **kwargs)
108
-
109
- return wrapper
 
 
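The deleted init_distributed_mode did nothing exotic: it read the environment variables that launchers such as torchrun export (RANK, WORLD_SIZE, LOCAL_RANK), fell back to SLURM_PROCID, and otherwise disabled distributed mode. A sketch of just that detection step (illustrative only; no process group is initialised here, and the helper name is made up):

import os

def detect_launcher() -> dict | None:
    # Same precedence as the deleted init_distributed_mode.
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        return {
            "rank": int(os.environ["RANK"]),
            "world_size": int(os.environ["WORLD_SIZE"]),
            "gpu": int(os.environ["LOCAL_RANK"]),
        }
    if "SLURM_PROCID" in os.environ:
        return {"rank": int(os.environ["SLURM_PROCID"])}
    return None  # not using distributed mode

print(detect_launcher())
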
NatureLM/infer.py CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
5
 
6
  import numpy as np
7
  import pandas as pd
8
- import soundfile as sf
9
  import torch
10
 
11
  from NatureLM.config import Config
@@ -16,10 +16,15 @@ from NatureLM.utils import move_to_device
16
  _MAX_LENGTH_SECONDS = 10
17
  _MIN_CHUNK_LENGTH_SECONDS = 0.5
18
  _SAMPLE_RATE = 16000 # Assuming the model uses a sample rate of 16kHz
19
- _AUDIO_FILE_EXTENSIONS = [".wav", ".mp3", ".flac", ".ogg"] # Add other audio file formats as needed
20
- _DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
21
- __this_dir = Path(__file__).parent.parent
22
- _DEFAULT_CONFIG_PATH = __this_dir / "configs" / "inference.yml"
 
 
 
 
 
23
 
24
 
25
  def load_model_and_config(
@@ -32,7 +37,9 @@ def load_model_and_config(
32
  model = NatureLM.from_pretrained("EarthSpeciesProject/NatureLM-audio")
33
  model = model.to(device).eval()
34
  model.llama_tokenizer.pad_token_id = model.llama_tokenizer.eos_token_id
35
- model.llama_model.generation_config.pad_token_id = model.llama_tokenizer.pad_token_id
 
 
36
 
37
  cfg = Config.from_sources(cfg_path)
38
  return model, cfg
@@ -53,7 +60,7 @@ def sliding_window_inference(
53
  hop_length_seconds: float = 10.0,
54
  input_sr: int = _SAMPLE_RATE,
55
  device: str = _DEVICE,
56
- ) -> str:
57
  """Run inference on a long audio file using sliding window approach.
58
 
59
  Args:
@@ -73,7 +80,7 @@ def sliding_window_inference(
73
  ValueError: If the audio file is too short or if the audio file path is invalid.
74
  """
75
  if isinstance(audio, str) or isinstance(audio, Path):
76
- audio_array, input_sr = sf.read(str(audio))
77
  elif isinstance(audio, np.ndarray):
78
  audio_array = audio
79
  print(f"Using provided sample rate: {input_sr}")
@@ -86,13 +93,16 @@ def sliding_window_inference(
86
 
87
  # Do initial check that the audio is long enough
88
  if audio_array.shape[-1] < int(_MIN_CHUNK_LENGTH_SECONDS * input_sr):
89
- raise ValueError(f"Audio is too short. Minimum length is {_MIN_CHUNK_LENGTH_SECONDS} seconds.")
 
 
90
 
91
  start = 0
92
  stride = int(hop_length_seconds * input_sr)
93
  window_length = int(window_length_seconds * input_sr)
 
94
 
95
- output = ""
96
  while True:
97
  chunk = audio_array[start : start + window_length]
98
  if chunk.shape[-1] < int(_MIN_CHUNK_LENGTH_SECONDS * input_sr):
@@ -113,8 +123,16 @@ def sliding_window_inference(
113
  prediction: str = model.generate(input_to_model, cfg.generate, prompt_list)[0]
114
 
115
  # Post-process the prediction
116
- prediction = output_template(prediction, start / input_sr, (start + window_length) / input_sr)
117
- output += prediction
 
 
 
 
 
 
 
 
118
 
119
  # Move the window
120
  start += stride
@@ -128,7 +146,9 @@ def sliding_window_inference(
128
  class Pipeline:
129
  """Pipeline for running NatureLM-audio inference on a list of audio files or audio arrays"""
130
 
131
- def __init__(self, model: NatureLM = None, cfg_path: str | Path = _DEFAULT_CONFIG_PATH):
 
 
132
  self.cfg_path = cfg_path
133
 
134
  # Load model and config
@@ -139,7 +159,9 @@ class Pipeline:
139
  # Download model from hub
140
  self.model, self.cfg = load_model_and_config(cfg_path)
141
 
142
- self.processor = NatureLMAudioProcessor(sample_rate=_SAMPLE_RATE, max_length_seconds=_MAX_LENGTH_SECONDS)
 
 
143
 
144
  def __call__(
145
  self,
@@ -149,6 +171,7 @@ class Pipeline:
149
  hop_length_seconds: float = 10.0,
150
  input_sample_rate: int = _SAMPLE_RATE,
151
  verbose: bool = False,
 
152
  ) -> list[str]:
153
  """Run inference on a list of audio file paths or a single audio file with a
154
  single query or a list of queries. If multiple queries are provided,
@@ -165,18 +188,11 @@ class Pipeline:
165
  Defaults to False.
166
 
167
  Returns:
168
- str | list[str]: The output of the model..
 
169
 
170
  Raises:
171
  ValueError: If the number of audio files and queries do not match.
172
-
173
- Example:
174
- >>> pipeline = Pipeline()
175
- >>> audios = ["assets/nri-GreenTreeFrogEvergladesNP.mp3"]
176
- >>> queries = ["Which species is this? Provide the common name."]
177
- >>> results = pipeline(audios, queries)
178
- >>> print(results)
179
- ['#0.00s - 10.00s#: Green Treefrog\n']
180
  """
181
  if isinstance(audios, str) or isinstance(audios, Path):
182
  audios = [audios]
@@ -189,7 +205,10 @@ class Pipeline:
189
 
190
  # Run inference
191
  results = []
192
- for audio, query in zip(audios, queries):
 
 
 
193
  output = sliding_window_inference(
194
  audio,
195
  query,
@@ -209,21 +228,38 @@ class Pipeline:
209
  def parse_args() -> argparse.Namespace:
210
  parser = argparse.ArgumentParser("Run NatureLM-audio inference")
211
  parser.add_argument(
212
- "-a", "--audio", type=str, required=True, help="Path to an audio file or a directory containing audio files"
 
 
 
 
 
 
 
213
  )
214
- parser.add_argument("-q", "--query", type=str, required=True, help="Query for the model")
215
  parser.add_argument(
216
  "--cfg-path",
217
  type=str,
218
  default="configs/inference.yml",
219
  help="Path to the configuration file for the model",
220
  )
221
- parser.add_argument("--output_path", type=str, default="inference_output.jsonl", help="Output path for the results")
222
  parser.add_argument(
223
- "--window_length_seconds", type=float, default=10.0, help="Length of the sliding window in seconds"
 
 
 
 
 
 
 
 
 
224
  )
225
  parser.add_argument(
226
- "--hop_length_seconds", type=float, default=10.0, help="Hop length for the sliding window in seconds"
 
 
 
227
  )
228
  args = parser.parse_args()
229
 
@@ -261,7 +297,9 @@ def main(
261
  audio_path = Path(audio_path)
262
  if audio_path.is_dir():
263
  audio_paths = []
264
- print(f"Searching for audio files in {str(audio_path)} with extensions {', '.join(_AUDIO_FILE_EXTENSIONS)}")
 
 
265
  for ext in _AUDIO_FILE_EXTENSIONS:
266
  audio_paths.extend(list(audio_path.rglob(f"*{ext}")))
267
 
@@ -278,18 +316,30 @@ def main(
278
  if not query:
279
  raise ValueError("Query cannot be empty")
280
  if not audio_paths:
281
- raise ValueError("No audio files found. Please check the path or file extensions.")
 
 
282
 
283
  # Load model and config
284
  model, cfg = load_model_and_config(cfg_path)
285
 
286
  # Load audio processor
287
- processor = NatureLMAudioProcessor(sample_rate=_SAMPLE_RATE, max_length_seconds=_MAX_LENGTH_SECONDS)
 
 
288
 
289
  # Run inference
290
  results = {"audio_path": [], "output": []}
291
  for path in audio_paths:
292
- output = sliding_window_inference(path, query, processor, model, cfg, window_length_seconds, hop_length_seconds)
 
 
 
 
 
 
 
 
293
  results["audio_path"].append(str(path))
294
  results["output"].append(output)
295
  print(f"Processed {path}, model output:\n=======\n{output}\n=======\n")
 
5
 
6
  import numpy as np
7
  import pandas as pd
8
+ import librosa
9
  import torch
10
 
11
  from NatureLM.config import Config
 
16
  _MAX_LENGTH_SECONDS = 10
17
  _MIN_CHUNK_LENGTH_SECONDS = 0.5
18
  _SAMPLE_RATE = 16000 # Assuming the model uses a sample rate of 16kHz
19
+ _AUDIO_FILE_EXTENSIONS = [
20
+ ".wav",
21
+ ".mp3",
22
+ ".flac",
23
+ ".ogg",
24
+ ] # Add other audio file formats as needed
25
+ _DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
26
+ __root_dir = Path(__file__).parent.parent
27
+ _DEFAULT_CONFIG_PATH = __root_dir / "configs" / "inference.yml"
28
 
29
 
30
  def load_model_and_config(
 
37
  model = NatureLM.from_pretrained("EarthSpeciesProject/NatureLM-audio")
38
  model = model.to(device).eval()
39
  model.llama_tokenizer.pad_token_id = model.llama_tokenizer.eos_token_id
40
+ model.llama_model.generation_config.pad_token_id = (
41
+ model.llama_tokenizer.pad_token_id
42
+ )
43
 
44
  cfg = Config.from_sources(cfg_path)
45
  return model, cfg
 
60
  hop_length_seconds: float = 10.0,
61
  input_sr: int = _SAMPLE_RATE,
62
  device: str = _DEVICE,
63
+ ) -> list[dict[str, any]]:
64
  """Run inference on a long audio file using sliding window approach.
65
 
66
  Args:
 
80
  ValueError: If the audio file is too short or if the audio file path is invalid.
81
  """
82
  if isinstance(audio, str) or isinstance(audio, Path):
83
+ audio_array, input_sr = librosa.load(str(audio), sr=None, mono=False)
84
  elif isinstance(audio, np.ndarray):
85
  audio_array = audio
86
  print(f"Using provided sample rate: {input_sr}")
 
93
 
94
  # Do initial check that the audio is long enough
95
  if audio_array.shape[-1] < int(_MIN_CHUNK_LENGTH_SECONDS * input_sr):
96
+ raise ValueError(
97
+ f"Audio is too short. Minimum length is {_MIN_CHUNK_LENGTH_SECONDS} seconds."
98
+ )
99
 
100
  start = 0
101
  stride = int(hop_length_seconds * input_sr)
102
  window_length = int(window_length_seconds * input_sr)
103
+ window_id = 0
104
 
105
+ output = [] # Initialize output list
106
  while True:
107
  chunk = audio_array[start : start + window_length]
108
  if chunk.shape[-1] < int(_MIN_CHUNK_LENGTH_SECONDS * input_sr):
 
123
  prediction: str = model.generate(input_to_model, cfg.generate, prompt_list)[0]
124
 
125
  # Post-process the prediction
126
+ # prediction = output_template(prediction, start / input_sr, (start + window_length) / input_sr)
127
+ # output += prediction
128
+ output.append(
129
+ {
130
+ "start_time": start / input_sr,
131
+ "end_time": (start + window_length) / input_sr,
132
+ "prediction": prediction,
133
+ "window_number": window_id,
134
+ }
135
+ )
136
 
137
  # Move the window
138
  start += stride
 
146
  class Pipeline:
147
  """Pipeline for running NatureLM-audio inference on a list of audio files or audio arrays"""
148
 
149
+ def __init__(
150
+ self, model: NatureLM = None, cfg_path: str | Path = _DEFAULT_CONFIG_PATH
151
+ ):
152
  self.cfg_path = cfg_path
153
 
154
  # Load model and config
 
159
  # Download model from hub
160
  self.model, self.cfg = load_model_and_config(cfg_path)
161
 
162
+ self.processor = NatureLMAudioProcessor(
163
+ sample_rate=_SAMPLE_RATE, max_length_seconds=_MAX_LENGTH_SECONDS
164
+ )
165
 
166
  def __call__(
167
  self,
 
171
  hop_length_seconds: float = 10.0,
172
  input_sample_rate: int = _SAMPLE_RATE,
173
  verbose: bool = False,
174
+ progress_bar=None,
175
  ) -> list[str]:
176
  """Run inference on a list of audio file paths or a single audio file with a
177
  single query or a list of queries. If multiple queries are provided,
 
188
  Defaults to False.
189
 
190
  Returns:
191
+ list[list[dict]]: List of model outputs for each audio file. Each output is a list of dictionaries
192
+ containing the start time, end time, and prediction for each chunk of audio.
193
 
194
  Raises:
195
  ValueError: If the number of audio files and queries do not match.
 
 
 
 
 
 
 
 
196
  """
197
  if isinstance(audios, str) or isinstance(audios, Path):
198
  audios = [audios]
 
205
 
206
  # Run inference
207
  results = []
208
+ progress_bar(0, desc="Starting")
209
+ for audio, query in progress_bar.tqdm(
210
+ zip(audios, queries), desc="Generating responses", total=len(audios)
211
+ ):
212
  output = sliding_window_inference(
213
  audio,
214
  query,
 
228
  def parse_args() -> argparse.Namespace:
229
  parser = argparse.ArgumentParser("Run NatureLM-audio inference")
230
  parser.add_argument(
231
+ "-a",
232
+ "--audio",
233
+ type=str,
234
+ required=True,
235
+ help="Path to an audio file or a directory containing audio files",
236
+ )
237
+ parser.add_argument(
238
+ "-q", "--query", type=str, required=True, help="Query for the model"
239
  )
 
240
  parser.add_argument(
241
  "--cfg-path",
242
  type=str,
243
  default="configs/inference.yml",
244
  help="Path to the configuration file for the model",
245
  )
 
246
  parser.add_argument(
247
+ "--output_path",
248
+ type=str,
249
+ default="inference_output.jsonl",
250
+ help="Output path for the results",
251
+ )
252
+ parser.add_argument(
253
+ "--window_length_seconds",
254
+ type=float,
255
+ default=10.0,
256
+ help="Length of the sliding window in seconds",
257
  )
258
  parser.add_argument(
259
+ "--hop_length_seconds",
260
+ type=float,
261
+ default=10.0,
262
+ help="Hop length for the sliding window in seconds",
263
  )
264
  args = parser.parse_args()
265
 
 
297
  audio_path = Path(audio_path)
298
  if audio_path.is_dir():
299
  audio_paths = []
300
+ print(
301
+ f"Searching for audio files in {str(audio_path)} with extensions {', '.join(_AUDIO_FILE_EXTENSIONS)}"
302
+ )
303
  for ext in _AUDIO_FILE_EXTENSIONS:
304
  audio_paths.extend(list(audio_path.rglob(f"*{ext}")))
305
 
 
316
  if not query:
317
  raise ValueError("Query cannot be empty")
318
  if not audio_paths:
319
+ raise ValueError(
320
+ "No audio files found. Please check the path or file extensions."
321
+ )
322
 
323
  # Load model and config
324
  model, cfg = load_model_and_config(cfg_path)
325
 
326
  # Load audio processor
327
+ processor = NatureLMAudioProcessor(
328
+ sample_rate=_SAMPLE_RATE, max_length_seconds=_MAX_LENGTH_SECONDS
329
+ )
330
 
331
  # Run inference
332
  results = {"audio_path": [], "output": []}
333
  for path in audio_paths:
334
+ output = sliding_window_inference(
335
+ path,
336
+ query,
337
+ processor,
338
+ model,
339
+ cfg,
340
+ window_length_seconds,
341
+ hop_length_seconds,
342
+ )
343
  results["audio_path"].append(str(path))
344
  results["output"].append(output)
345
  print(f"Processed {path}, model output:\n=======\n{output}\n=======\n")
NatureLM/logger.py DELETED
@@ -1,190 +0,0 @@
1
- import datetime
2
- import logging
3
- import time
4
- from collections import defaultdict, deque
5
-
6
- import torch
7
- import torch.distributed as dist
8
- import wandb
9
-
10
- from NatureLM.dist_utils import is_dist_avail_and_initialized, is_main_process
11
-
12
-
13
- class SmoothedValue(object):
14
- """Track a series of values and provide access to smoothed values over a
15
- window or the global series average.
16
- """
17
-
18
- def __init__(self, window_size=20, fmt=None):
19
- if fmt is None:
20
- fmt = "{median:.4f} ({global_avg:.4f})"
21
- self.deque = deque(maxlen=window_size)
22
- self.total = 0.0
23
- self.count = 0
24
- self.fmt = fmt
25
-
26
- def update(self, value, n=1):
27
- self.deque.append(value)
28
- self.count += n
29
- self.total += value * n
30
-
31
- def synchronize_between_processes(self):
32
- """
33
- Warning: does not synchronize the deque!
34
- """
35
- if not is_dist_avail_and_initialized():
36
- return
37
- t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
38
- dist.barrier()
39
- dist.all_reduce(t)
40
- t = t.tolist()
41
- self.count = int(t[0])
42
- self.total = t[1]
43
-
44
- @property
45
- def median(self):
46
- d = torch.tensor(list(self.deque))
47
- return d.median().item()
48
-
49
- @property
50
- def avg(self):
51
- d = torch.tensor(list(self.deque), dtype=torch.float32)
52
- return d.mean().item()
53
-
54
- @property
55
- def global_avg(self):
56
- return self.total / self.count
57
-
58
- @property
59
- def max(self):
60
- return max(self.deque)
61
-
62
- @property
63
- def value(self):
64
- return self.deque[-1]
65
-
66
- def __str__(self):
67
- return self.fmt.format(
68
- median=self.median,
69
- avg=self.avg,
70
- global_avg=self.global_avg,
71
- max=self.max,
72
- value=self.value,
73
- )
74
-
75
-
76
- class MetricLogger(object):
77
- def __init__(self, delimiter="\t"):
78
- self.meters = defaultdict(SmoothedValue)
79
- self.delimiter = delimiter
80
-
81
- def update(self, **kwargs):
82
- for k, v in kwargs.items():
83
- if isinstance(v, torch.Tensor):
84
- v = v.item()
85
- assert isinstance(v, (float, int))
86
- self.meters[k].update(v)
87
-
88
- def __getattr__(self, attr):
89
- if attr in self.meters:
90
- return self.meters[attr]
91
- if attr in self.__dict__:
92
- return self.__dict__[attr]
93
- raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
94
-
95
- def __str__(self):
96
- loss_str = []
97
- for name, meter in self.meters.items():
98
- loss_str.append("{}: {}".format(name, str(meter)))
99
- return self.delimiter.join(loss_str)
100
-
101
- def global_avg(self):
102
- loss_str = []
103
- for name, meter in self.meters.items():
104
- loss_str.append("{}: {:.4f}".format(name, meter.global_avg))
105
- return self.delimiter.join(loss_str)
106
-
107
- def synchronize_between_processes(self):
108
- for meter in self.meters.values():
109
- meter.synchronize_between_processes()
110
-
111
- def add_meter(self, name, meter):
112
- self.meters[name] = meter
113
-
114
- def log_every(self, iterable, print_freq, header=None, logger=None, start_step=None):
115
- i = 0
116
- if not header:
117
- header = ""
118
- start_time = time.time()
119
- end = time.time()
120
- iter_time = SmoothedValue(fmt="{avg:.4f}")
121
- data_time = SmoothedValue(fmt="{avg:.4f}")
122
- space_fmt = ":" + str(len(str(len(iterable)))) + "d"
123
- log_msg = [
124
- header,
125
- "[{0" + space_fmt + "}/{1}]",
126
- "eta: {eta}",
127
- "{meters}",
128
- "time: {time}",
129
- "data: {data}",
130
- ]
131
- if torch.cuda.is_available():
132
- log_msg.append("max mem: {memory:.0f}")
133
- log_msg = self.delimiter.join(log_msg)
134
- MB = 1024.0 * 1024.0
135
- for obj in iterable:
136
- data_time.update(time.time() - end)
137
- yield obj
138
- iter_time.update(time.time() - end)
139
- if i % print_freq == 0 or i == len(iterable) - 1:
140
- if is_main_process():
141
- if logger is not None:
142
- assert start_step is not None, "start_step is needed to compute global_step!"
143
- for name, meter in self.meters.items():
144
- logger.add_scalar("{}".format(name), float(str(meter)), global_step=start_step + i)
145
- # Log to wandb
146
- wandb.log({name: float(str(meter)) for name, meter in self.meters.items()}, step=start_step + i)
147
- eta_seconds = iter_time.global_avg * (len(iterable) - i)
148
- eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
149
- if torch.cuda.is_available():
150
- print(
151
- log_msg.format(
152
- i,
153
- len(iterable),
154
- eta=eta_string,
155
- meters=str(self),
156
- time=str(iter_time),
157
- data=str(data_time),
158
- memory=torch.cuda.max_memory_allocated() / MB,
159
- )
160
- )
161
- else:
162
- print(
163
- log_msg.format(
164
- i,
165
- len(iterable),
166
- eta=eta_string,
167
- meters=str(self),
168
- time=str(iter_time),
169
- data=str(data_time),
170
- )
171
- )
172
- i += 1
173
- end = time.time()
174
- total_time = time.time() - start_time
175
- total_time_str = str(datetime.timedelta(seconds=int(total_time)))
176
- print("{} Total time: {} ({:.4f} s / it)".format(header, total_time_str, total_time / len(iterable)))
177
-
178
-
179
- class AttrDict(dict):
180
- def __init__(self, *args, **kwargs):
181
- super(AttrDict, self).__init__(*args, **kwargs)
182
- self.__dict__ = self
183
-
184
-
185
- def setup_logger():
186
- logging.basicConfig(
187
- level=logging.INFO if is_main_process() else logging.WARN,
188
- format="%(asctime)s [%(levelname)s] %(message)s",
189
- handlers=[logging.StreamHandler()],
190
- )
NatureLM/models/NatureLM.py CHANGED
@@ -645,7 +645,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
645
  stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
646
 
647
  with torch.autocast(self.device.type, dtype=torch.bfloat16):
648
- outputs = self.llama_model.generate( # TODO: Wrap the llama_model with outlines https://outlines-dev.github.io/outlines/reference/models/transformers/
649
  inputs_embeds=embeds.bfloat16(),
650
  max_new_tokens=generate_cfg.max_new_tokens,
651
  stopping_criteria=stopping_criteria,
 
645
  stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
646
 
647
  with torch.autocast(self.device.type, dtype=torch.bfloat16):
648
+ outputs = self.llama_model.generate(
649
  inputs_embeds=embeds.bfloat16(),
650
  max_new_tokens=generate_cfg.max_new_tokens,
651
  stopping_criteria=stopping_criteria,
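
The generate call above relies on a stop-word criterion (StoppingCriteriaSub, defined elsewhere in this repository). A generic sketch of such a criterion under the standard transformers StoppingCriteria interface; this is an illustration, not the repository's class.

import torch
from transformers import StoppingCriteria

class StopOnTokens(StoppingCriteria):
    def __init__(self, stops: list[torch.Tensor]):
        super().__init__()
        self.stops = stops  # each entry is a tensor of token ids that should end generation

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Stop as soon as the tail of the generated sequence matches any stop sequence.
        for stop in self.stops:
            n = stop.shape[0]
            if input_ids.shape[1] >= n and torch.equal(input_ids[0, -n:], stop.to(input_ids.device)):
                return True
        return False
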
NatureLM/optims.py DELETED
@@ -1,154 +0,0 @@
1
- # This script is from https://github.com/salesforce/LAVIS/blob/main/lavis/common/optims.py
2
-
3
- import logging
4
- import math
5
-
6
- import torch
7
-
8
- from NatureLM.config import OptimizerConfig
9
-
10
-
11
- class LinearWarmupStepLRScheduler:
12
- def __init__(
13
- self,
14
- optimizer,
15
- max_epoch,
16
- min_lr,
17
- init_lr,
18
- decay_rate=1,
19
- warmup_start_lr=-1,
20
- warmup_steps=0,
21
- **kwargs,
22
- ):
23
- self.optimizer = optimizer
24
-
25
- self.max_epoch = max_epoch
26
- self.min_lr = min_lr
27
-
28
- self.decay_rate = decay_rate
29
-
30
- self.init_lr = init_lr
31
- self.warmup_steps = warmup_steps
32
- self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
33
-
34
- def step(self, cur_epoch, cur_step):
35
- if cur_epoch == 0:
36
- warmup_lr_schedule(
37
- step=cur_step,
38
- optimizer=self.optimizer,
39
- max_step=self.warmup_steps,
40
- init_lr=self.warmup_start_lr,
41
- max_lr=self.init_lr,
42
- )
43
- else:
44
- step_lr_schedule(
45
- epoch=cur_epoch,
46
- optimizer=self.optimizer,
47
- init_lr=self.init_lr,
48
- min_lr=self.min_lr,
49
- decay_rate=self.decay_rate,
50
- )
51
-
52
-
53
- class LinearWarmupCosineLRScheduler:
54
- def __init__(
55
- self,
56
- optimizer,
57
- max_epoch,
58
- iters_per_epoch,
59
- min_lr,
60
- init_lr,
61
- warmup_steps=0,
62
- warmup_start_lr=-1,
63
- **kwargs,
64
- ):
65
- self.optimizer = optimizer
66
-
67
- self.max_epoch = max_epoch
68
- self.iters_per_epoch = iters_per_epoch
69
- self.min_lr = min_lr
70
-
71
- self.init_lr = init_lr
72
- self.warmup_steps = warmup_steps
73
- self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
74
-
75
- def step(self, cur_epoch, cur_step):
76
- total_cur_step = cur_epoch * self.iters_per_epoch + cur_step
77
- if total_cur_step < self.warmup_steps:
78
- warmup_lr_schedule(
79
- step=cur_step,
80
- optimizer=self.optimizer,
81
- max_step=self.warmup_steps,
82
- init_lr=self.warmup_start_lr,
83
- max_lr=self.init_lr,
84
- )
85
- else:
86
- cosine_lr_schedule(
87
- epoch=total_cur_step,
88
- optimizer=self.optimizer,
89
- max_epoch=self.max_epoch * self.iters_per_epoch,
90
- init_lr=self.init_lr,
91
- min_lr=self.min_lr,
92
- )
93
-
94
-
95
- def cosine_lr_schedule(optimizer, epoch, max_epoch, init_lr, min_lr):
96
- """Decay the learning rate"""
97
- lr = (init_lr - min_lr) * 0.5 * (1.0 + math.cos(math.pi * epoch / max_epoch)) + min_lr
98
- for param_group in optimizer.param_groups:
99
- param_group["lr"] = lr
100
-
101
-
102
- def warmup_lr_schedule(optimizer, step, max_step, init_lr, max_lr):
103
- """Warmup the learning rate"""
104
- lr = min(max_lr, init_lr + (max_lr - init_lr) * step / max(max_step, 1))
105
- for param_group in optimizer.param_groups:
106
- param_group["lr"] = lr
107
-
108
-
109
- def step_lr_schedule(optimizer, epoch, init_lr, min_lr, decay_rate):
110
- """Decay the learning rate"""
111
- lr = max(min_lr, init_lr * (decay_rate**epoch))
112
- for param_group in optimizer.param_groups:
113
- param_group["lr"] = lr
114
-
115
-
116
- def get_optimizer(model, config: OptimizerConfig):
117
- num_parameters = 0
118
- p_wd, p_non_wd = [], []
119
- for n, p in model.named_parameters():
120
- if not p.requires_grad:
121
- continue # frozen weights
122
- print(n)
123
- if p.ndim < 2 or "bias" in n or "ln" in n or "bn" in n:
124
- p_non_wd.append(p)
125
- else:
126
- p_wd.append(p)
127
- num_parameters += p.data.nelement()
128
- logging.info("number of trainable parameters: %d" % num_parameters)
129
- optim_params = [
130
- {
131
- "params": p_wd,
132
- "weight_decay": float(config.weight_decay),
133
- },
134
- {"params": p_non_wd, "weight_decay": 0},
135
- ]
136
- beta2 = config.beta2
137
- if config.device == "cpu":
138
- optimizer = torch.optim.AdamW(
139
- optim_params,
140
- lr=float(config.init_lr),
141
- weight_decay=float(config.weight_decay),
142
- betas=(0.9, beta2),
143
- )
144
- else:
145
- import bitsandbytes as bnb
146
-
147
- optimizer = bnb.optim.PagedAdamW8bit(
148
- optim_params,
149
- lr=float(config.init_lr),
150
- weight_decay=float(config.weight_decay),
151
- betas=(0.9, beta2),
152
- )
153
-
154
- return optimizer
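
For reference, the removed cosine_lr_schedule decays the rate as lr = (init_lr - min_lr) * 0.5 * (1 + cos(pi * step / max_step)) + min_lr. A standalone restatement (the function name here is illustrative):

import math

def cosine_lr(step: int, max_step: int, init_lr: float, min_lr: float) -> float:
    # Same formula as the deleted cosine_lr_schedule, returned instead of written into param groups.
    return (init_lr - min_lr) * 0.5 * (1.0 + math.cos(math.pi * step / max_step)) + min_lr

# Halfway through training the rate sits midway between the bounds:
# cosine_lr(50, 100, 1e-4, 1e-5) ≈ 5.5e-5
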
NatureLM/processors.py CHANGED
@@ -6,7 +6,7 @@ from dataclasses import dataclass, field
6
 
7
  import numpy as np
8
  import resampy
9
- import soundfile as sf
10
  import torch
11
 
12
 
@@ -49,7 +49,7 @@ class NatureLMAudioProcessor:
49
  def prepare_audio(self, audio: list[float] | np.ndarray | os.PathLike, input_sr: int = None) -> torch.Tensor:
50
  """Prepare an audio array or file path for inference"""
51
  if isinstance(audio, str | os.PathLike):
52
- audio, sr = sf.read(audio)
53
  input_sr = sr
54
  elif isinstance(audio, list):
55
  audio = np.array(audio)
 
6
 
7
  import numpy as np
8
  import resampy
9
+ import librosa
10
  import torch
11
 
12
 
 
49
  def prepare_audio(self, audio: list[float] | np.ndarray | os.PathLike, input_sr: int = None) -> torch.Tensor:
50
  """Prepare an audio array or file path for inference"""
51
  if isinstance(audio, str | os.PathLike):
52
+ audio, sr = librosa.load(audio, sr=None, mono=False)
53
  input_sr = sr
54
  elif isinstance(audio, list):
55
  audio = np.array(audio)
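
One behavioural difference introduced by this swap: soundfile.read returns a (frames, channels) float64 array, while librosa.load with sr=None and mono=False returns a float32 array shaped (channels, frames) at the file's native sample rate. A hedged sketch of handling the multi-channel case ("example.wav" is a placeholder path):

import librosa
import numpy as np

audio, sr = librosa.load("example.wav", sr=None, mono=False)  # native rate, channels first
if audio.ndim > 1:
    audio = np.mean(audio, axis=0)  # collapse channels before any resampling
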
NatureLM/runner.py DELETED
@@ -1,515 +0,0 @@
1
- # This script is based on https://github.com/salesforce/LAVIS/blob/main/lavis/runners/runner_base.py
2
-
3
- import datetime
4
- import json
5
- import logging
6
- import os
7
- import time
8
- from collections import defaultdict
9
- from pathlib import Path
10
-
11
- import torch
12
- import torch.distributed
13
- import torch.distributed as dist
14
- import wandb
15
- from torch.nn.parallel import DistributedDataParallel as DDP
16
- from torch.utils.tensorboard import SummaryWriter
17
-
18
- from NatureLM.config import Config
19
- from NatureLM.dist_utils import get_rank, get_world_size, is_dist_avail_and_initialized, is_main_process, main_process
20
- from NatureLM.logger import MetricLogger, SmoothedValue
21
- from NatureLM.optims import LinearWarmupCosineLRScheduler, get_optimizer
22
- from NatureLM.task_metrics import get_task_metrics
23
- from NatureLM.utils import get_dataloader, prepare_sample_dist
24
-
25
-
26
- class Runner:
27
- def __init__(self, cfg: Config, model, datasets, job_id):
28
- self.config = cfg
29
-
30
- # log
31
- device = "cuda:0"
32
- if is_main_process():
33
- if self.config.run.wandb_enabled:
34
- wandb.init(project="earthlm", config=self.config.model_dump())
35
- else:
36
- wandb.init(mode="disabled")
37
-
38
- if "LOCAL_RANK" in os.environ:
39
- device = int(os.environ["LOCAL_RANK"])
40
- else:
41
- device = self.config.run.device
42
- print(f"device is {device} could have been {self.config.run.device}")
43
- self.output_dir = Path(self.config.run.output_dir) / job_id
44
- self.output_dir.mkdir(parents=True, exist_ok=True)
45
- self.log_writter = SummaryWriter(self.output_dir)
46
-
47
- # settings
48
- self.device = torch.device(device)
49
- self.use_distributed = self.config.run.use_distributed
50
- self.start_epoch = 0
51
- self.max_epoch = self.config.run.optims.max_epoch
52
- self.evaluate_only = self.config.run.evaluate
53
- self.cuda_enabled = self.device.type == "cuda"
54
-
55
- # test prompt
56
- self.prompt_template = self.config.model.prompt_template
57
-
58
- # model
59
- self._model = model
60
- torch.nn.SyncBatchNorm.convert_sync_batchnorm(self._model)
61
- self._model.to(self.device)
62
- if self.use_distributed:
63
- self.model = DDP(
64
- self._model,
65
- find_unused_parameters=True,
66
- static_graph=False,
67
- device_ids=[self.device],
68
- )
69
- else:
70
- self.model = self._model
71
-
72
- # dataloaders
73
- self.train_loader = get_dataloader(
74
- datasets["train"],
75
- self.config.run,
76
- is_train=True,
77
- use_distributed=self.use_distributed,
78
- )
79
- self.valid_loader = get_dataloader(
80
- datasets["valid"],
81
- self.config.run,
82
- is_train=False,
83
- use_distributed=self.use_distributed,
84
- )
85
- self.test_loader = get_dataloader(
86
- datasets["test"],
87
- self.config.run,
88
- is_train=False,
89
- use_distributed=self.use_distributed,
90
- )
91
-
92
- # scaler
93
- self.use_amp = self.config.run.amp
94
- if self.use_amp:
95
- self.scaler = torch.cuda.amp.GradScaler()
96
- else:
97
- self.scaler = None
98
-
99
- # optimizer & scheduler
100
- self.iters_per_epoch = (
101
- len(self.train_loader) if self.config.run.epoch_based else self.config.run.iters_per_epoch
102
- )
103
- self.optimizer = get_optimizer(self.model, self.config.run.optims)
104
- self.scheduler = LinearWarmupCosineLRScheduler(
105
- self.optimizer,
106
- max_epoch=self.max_epoch,
107
- iters_per_epoch=self.iters_per_epoch,
108
- min_lr=self.config.run.optims.min_lr,
109
- init_lr=self.config.run.optims.init_lr,
110
- warmup_steps=self.config.run.optims.warmup_steps,
111
- warmup_start_lr=self.config.run.optims.warmup_start_lr,
112
- )
113
-
114
- #### augmentations
115
- # self.rng = random.Random(self.config.run.seed)
116
- # self.rngnp = np.random.default_rng(seed=self.config.run.seed)
117
- # self.rngth = torch.Generator(device=args.device)
118
- # self.rngth.manual_seed(self.config.run.seed)
119
- # augments = []
120
- # if self.config.run.augmentations.flip:
121
- # augments.append(augmentations.Flip(self.config.run.augmentations.flip, rngth=self.rngth, seed=self.config.run.seed))
122
- # if self.config.run.augmentations.bandmask:
123
- # augments.append(augmentations.BandMask(self.config.run.augmentations.bandmask, sample_rate=args.sample_rate, rng=self.rng, seed=self.config.run.seed))
124
- # if self.config.run.augmentations.revecho:
125
- # augments.append(
126
- # augmentations.RevEcho(proba=self.config.run.augmentations.revecho,rng=self.rng,seed=self.config.run.seed))
127
- # self.augment = torch.nn.Sequential(*augments)
128
-
129
- self.log_config()
130
-
131
- def unwrap_dist_model(self, model):
132
- if self.use_distributed:
133
- return model.module
134
- else:
135
- return model
136
-
137
- def train_epoch(self, epoch):
138
- self.model.train()
139
-
140
- metric_logger = MetricLogger(delimiter=" ")
141
- metric_logger.add_meter("lr", SmoothedValue(window_size=1, fmt="{value:.6f}"))
142
- metric_logger.add_meter("loss", SmoothedValue(window_size=1, fmt="{value:.4f}"))
143
-
144
- logging.info("Start training epoch {}, {} iters per inner epoch.".format(epoch, self.iters_per_epoch))
145
- header = "Train: data epoch: [{}]".format(epoch)
146
-
147
- # Get gradient clipping parameters from config
148
- clip_grad_norm = self.config.run.optims.max_grad_norm
149
- clip_grad_value = self.config.run.optims.max_grad_value
150
-
151
- for i in metric_logger.log_every(
152
- range(self.iters_per_epoch),
153
- self.config.run.log_freq,
154
- header=header,
155
- logger=self.log_writter,
156
- start_step=epoch * self.iters_per_epoch,
157
- ):
158
- if i >= self.iters_per_epoch:
159
- break
160
-
161
- samples = next(self.train_loader)
162
-
163
- samples = prepare_sample_dist(samples, self.device)
164
-
165
- #### augmentation
166
- # if False:
167
- # samples = self.augment(samples)
168
-
169
- self.scheduler.step(cur_epoch=epoch, cur_step=i)
170
-
171
- with torch.autocast(self.device.type, enabled=self.use_amp, dtype=torch.bfloat16):
172
- loss = self.model(samples)["loss"]
173
- if torch.isnan(loss):
174
- print("loss nan", samples)
175
- # continue
176
-
177
- if self.use_amp and self.scaler:
178
- self.scaler.scale(loss).backward()
179
- else:
180
- loss.backward()
181
-
182
- # Apply gradient clipping
183
- if clip_grad_norm is not None:
184
- if self.use_amp and self.scaler:
185
- self.scaler.unscale_(self.optimizer)
186
- torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=clip_grad_norm)
187
- if clip_grad_value is not None:
188
- if self.use_amp and self.scaler:
189
- self.scaler.unscale_(self.optimizer)
190
- torch.nn.utils.clip_grad_value_(self.model.parameters(), clip_value=clip_grad_value)
191
-
192
- if (i + 1) % self.config.run.accum_grad_iters == 0:
193
- if self.use_amp and self.scaler:
194
- self.scaler.step(self.optimizer)
195
- self.scaler.update()
196
- else:
197
- self.optimizer.step()
198
- self.optimizer.zero_grad()
199
-
200
- metric_logger.update(loss=loss.item())
201
- metric_logger.update(lr=self.optimizer.param_groups[0]["lr"])
202
-
203
- metric_logger.synchronize_between_processes()
204
- logging.info("Averaged stats: " + str(metric_logger.global_avg()))
205
- return {k: "{:.3f}".format(meter.global_avg) for k, meter in metric_logger.meters.items()}
206
-
207
- @torch.no_grad()
208
- def valid_epoch(self, epoch, split, decode=True, save_json=False, decode_ratio=1.0):
209
- """
210
- Decode = True will lead to calculation of custom metrics which are based on text.
211
- decode_ratio controls the percentage of batches which will have custom metrics computed,
212
- a speed trade-off due to the cost of the 'generate' method.
213
- """
214
- model = self.unwrap_dist_model(self.model)
215
- model.eval()
216
-
217
- dataloader = getattr(self, split + "_loader", None)
218
- assert dataloader is not None, f"{split}_loader does not exist."
219
-
220
- metric_logger = MetricLogger(delimiter=" ")
221
- header = f"Eval: data epoch: [{epoch}]"
222
-
223
- results_per_task = defaultdict(list) # Store results per task
224
- overall_results = [] # Store all results for overall metrics
225
-
226
- # Calculate N based on decode_ratio
227
- if decode_ratio <= 0.0:
228
- N = float("inf") # Effectively never run generate
229
- elif decode_ratio >= 1.0:
230
- N = 1 # Run generate every batch
231
- else:
232
- N = max(int(1 / decode_ratio), 1) # Ensure N is at least 1
233
-
234
- batch_idx = 0
235
-
236
- # Initialize overall metrics
237
- overall_res = {
238
- "loss": torch.tensor(0.0, device=self.device),
239
- "correct": torch.tensor(0.0, device=self.device),
240
- "total": torch.tensor(0.0, device=self.device),
241
- }
242
-
243
- # Initialize per-task metrics
244
- per_task_res = defaultdict(
245
- lambda: {
246
- "loss": torch.tensor(0.0, device=self.device),
247
- "correct": torch.tensor(0.0, device=self.device),
248
- "total": torch.tensor(0.0, device=self.device),
249
- "n_sample": 0,
250
- "predicted_texts": [],
251
- "gold_texts": [],
252
- }
253
- )
254
-
255
- for samples in metric_logger.log_every(dataloader, self.config.run.log_freq, header=header):
256
- samples = prepare_sample_dist(samples, self.device)
257
-
258
- with torch.autocast(self.device.type, enabled=self.use_amp):
259
- forward_result = model(samples, verbose=True)
260
-
261
- # Extract batch-level loss and correct counts
262
- batch_loss = forward_result.get("loss", torch.tensor(0.0, device=self.device))
263
- batch_correct = forward_result.get("correct", torch.tensor(0.0, device=self.device))
264
- batch_total = forward_result.get("total", torch.tensor(1.0, device=self.device))
265
-
266
- batch_size = len(samples["id"])
267
-
268
- # Update overall metrics with batch-level values
269
- overall_res["loss"] += batch_loss.detach()
270
- overall_res["correct"] += batch_correct.detach()
271
- overall_res["total"] += batch_total.detach()
272
-
273
- # Decide whether to run generate based on decode_ratio
274
- if decode and (batch_idx % N == 0):
275
- prompts = samples.get("prompt", None)
276
- try:
277
- generated_texts = model.generate(samples, self.config.generate, prompts=prompts)
278
- except Exception as e:
279
- print("error in generation", e)
280
- generated_texts = [None] * batch_size
281
- else:
282
- generated_texts = [None] * batch_size # Placeholder if not decoding
283
-
284
- # Process per-sample data for per-task metrics and result saving
285
- for i in range(batch_size):
286
- task = samples["task"][i]
287
-
288
- # Collect per-task batch-level metrics
289
- per_task_res[task]["loss"] += batch_loss.detach()
290
- per_task_res[task]["correct"] += batch_correct.detach()
291
- per_task_res[task]["total"] += batch_total.detach()
292
- per_task_res[task]["n_sample"] += 1
293
-
294
- res = {
295
- "id": samples["id"][i],
296
- "ground_truth": samples["text"][i], # Gold label from dataloader
297
- "task": task,
298
- "predicted_text": generated_texts[i],
299
- }
300
-
301
- if decode and generated_texts[i] is not None:
302
- res["prompt"] = samples.get("prompt", [None])[i]
303
-
304
- results_per_task[task].append(res)
305
- overall_results.append(res)
306
-
307
- # Collect texts for custom metrics
308
- if generated_texts[i] is not None:
309
- per_task_res[task]["predicted_texts"].append(generated_texts[i])
310
- per_task_res[task]["gold_texts"].append(samples["text"][i])
311
-
312
- batch_idx += 1 # Increment batch index
313
-
314
- if save_json:
315
- for task, task_results in results_per_task.items():
316
- self.save_result(task_results, self.output_dir, f"eval_{split}_{task}_epoch_{epoch}")
317
- # Optionally save overall results
318
- self.save_result(overall_results, self.output_dir, f"eval_{split}_epoch_{epoch}")
319
-
320
- # Synchronize metrics across processes if in distributed mode
321
- if is_dist_avail_and_initialized():
322
- for key in overall_res:
323
- dist.all_reduce(overall_res[key])
324
-
325
- overall_ret = {
326
- "loss": (overall_res["loss"] / batch_idx).item(),
327
- "agg_metrics": (overall_res["correct"] / overall_res["total"]).item(),
328
- }
329
-
330
- if is_main_process():
331
- # Log overall metrics
332
- wandb.log(
333
- {
334
- f"{split}_loss": overall_ret["loss"],
335
- f"{split}_accuracy": overall_ret["agg_metrics"],
336
- "epoch": epoch,
337
- }
338
- )
339
-
340
- # Compute and log per-task metrics
341
- for task, res in per_task_res.items():
342
- if "caption-none" in task:
343
- continue
344
-
345
- if self.use_distributed:
346
- print(f"Rank {dist.get_rank()}, task={task}, ")
347
-
348
- print(
349
- f"loss={res['loss'].shape, res['loss'].dtype}, "
350
- f"correct={res['correct'].shape, res['correct'].dtype}, "
351
- f"total={res['total'].shape, res['total'].dtype}, "
352
- f"n_sample={res['n_sample']}"
353
- )
354
-
355
- # Synchronize metrics across processes if in distributed mode
356
- if is_dist_avail_and_initialized():
357
- dist.all_reduce(res["loss"])
358
- dist.all_reduce(res["correct"])
359
- dist.all_reduce(res["total"])
360
- dist.all_reduce(torch.tensor(res["n_sample"], device=self.device))
361
-
362
- ret = {
363
- "loss": (res["loss"] / res["n_sample"]).item(),
364
- "agg_metrics": (res["correct"] / res["total"]).item(),
365
- }
366
-
367
- if is_main_process():
368
- # Log per-task metrics
369
- wandb.log(
370
- {
371
- f"{split}_{task}_loss": ret["loss"],
372
- f"{split}_{task}_accuracy": ret["agg_metrics"],
373
- "epoch": epoch,
374
- }
375
- )
376
-
377
- # Get and compute custom metrics for this task
378
- metrics_list = get_task_metrics(task)
379
- predicted_texts = res["predicted_texts"]
380
- gold_texts = res["gold_texts"]
381
- for metric in metrics_list:
382
- if predicted_texts and gold_texts:
383
- metric_value = metric.compute_metric(predicted_texts, gold_texts)
384
- metric_name = metric.__class__.__name__
385
- wandb.log(
386
- {
387
- f"{split}_{task}_{metric_name}": metric_value,
388
- "epoch": epoch,
389
- }
390
- )
391
- return overall_ret # Return overall metrics
392
-
393
- def save_result(self, result, result_dir, filename):
394
- result_file = os.path.join(result_dir, "%s_rank%d.json" % (filename, get_rank()))
395
- final_result_file = os.path.join(result_dir, "%s.json" % filename)
396
-
397
- try:
398
- json.dump(result, open(result_file, "w"), ensure_ascii=False)
399
- except Exception as e:
400
- logging.warning(f"Error saving {result_file}. Error: {e}")
401
- json.dump(result, open(result_file, "w", encoding="utf-8"), ensure_ascii=False)
402
-
403
- # if is_dist_avail_and_initialized():
404
- # dist.barrier()
405
-
406
- if is_main_process():
407
- logging.info("rank %d starts merging results." % get_rank())
408
- result = []
409
-
410
- for rank in range(get_world_size()):
411
- result_file = os.path.join(result_dir, "%s_rank%d.json" % (filename, rank))
412
- try:
413
- res = json.load(open(result_file, "r"))
414
- except Exception as e:
415
- logging.warning(f"Error reading {result_file}. Error: {e}")
416
- res = json.load(open(result_file, "r", encoding="utf-8"))
417
- result += res
418
-
419
- try:
420
- json.dump(result, open(final_result_file, "w"), ensure_ascii=False)
421
- except Exception as e:
422
- logging.warning(f"Error saving {final_result_file}. Error: {e}")
423
- json.dump(
424
- result,
425
- open(final_result_file, "w", encoding="utf-8"),
426
- ensure_ascii=False,
427
- )
428
-
429
- print("result file saved to %s" % final_result_file)
430
-
431
- def train(self):
432
- start_time = time.time()
433
- best_agg_metric = 0
434
- best_epoch = 0
435
-
436
- for cur_epoch in range(self.start_epoch, self.max_epoch):
437
- if self.evaluate_only:
438
- break
439
-
440
- # training phase
441
- logging.info("Training Phase")
442
- train_stats = self.train_epoch(cur_epoch)
443
- self.log_stats(train_stats, split_name="train")
444
-
445
- # validating phase
446
- logging.info("Validating Phase")
447
- valid_log = self.valid_epoch(
448
- cur_epoch,
449
- "valid",
450
- decode=self.config.run.custom_metrics,
451
- save_json=False,
452
- decode_ratio=self.config.run.decode_ratio,
453
- )
454
- if valid_log is not None:
455
- if is_main_process():
456
- agg_metrics = valid_log["agg_metrics"]
457
- if agg_metrics > best_agg_metric:
458
- best_agg_metric = agg_metrics
459
- best_epoch = cur_epoch
460
- self.save_checkpoint(cur_epoch, is_best=True)
461
-
462
- valid_log.update({"best_epoch": best_epoch})
463
- self.log_stats(valid_log, split_name="valid")
464
- self.save_checkpoint(cur_epoch, is_best=False)
465
-
466
- # if self.use_distributed:
467
- # dist.barrier()
468
-
469
- # testing phase
470
- if self.evaluate_only:
471
- self.valid_epoch("best", "test", decode=True, save_json=True)
472
-
473
- total_time = time.time() - start_time
474
- total_time_str = str(datetime.timedelta(seconds=int(total_time)))
475
- logging.info("Training time {}".format(total_time_str))
476
-
477
- @main_process
478
- def log_config(self):
479
- with open(os.path.join(self.output_dir, "log.txt"), "a") as f:
480
- f.write(json.dumps(self.config.model_dump_json(), indent=4) + "\n")
481
-
482
- @main_process
483
- def log_stats(self, stats, split_name):
484
- if isinstance(stats, dict):
485
- log_stats = {**{f"{split_name}_{k}": v for k, v in stats.items()}}
486
- with open(os.path.join(self.output_dir, "log.txt"), "a") as f:
487
- f.write(json.dumps(log_stats) + "\n")
488
- elif isinstance(stats, list):
489
- pass
490
-
491
- @main_process
492
- def save_checkpoint(self, cur_epoch, is_best=False):
493
- """
494
- Save the checkpoint at the current epoch.
495
- """
496
- model_no_ddp = self.unwrap_dist_model(self.model)
497
- param_grad_dic = {k: v.requires_grad for (k, v) in model_no_ddp.named_parameters()}
498
- state_dict = model_no_ddp.state_dict()
499
- for k in list(state_dict.keys()):
500
- if k in param_grad_dic.keys() and not param_grad_dic[k]:
501
- # delete parameters that do not require gradient
502
- del state_dict[k]
503
- save_obj = {
504
- "model": state_dict,
505
- "optimizer": self.optimizer.state_dict(),
506
- "config": dict(self.config),
507
- "scaler": self.scaler.state_dict() if self.scaler else None,
508
- "epoch": cur_epoch,
509
- }
510
- save_to = os.path.join(
511
- self.output_dir,
512
- "checkpoint_{}.pth".format("best" if is_best else cur_epoch),
513
- )
514
- logging.info("Saving checkpoint at epoch {} to {}.".format(cur_epoch, save_to))
515
- torch.save(save_obj, save_to)
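
The decode_ratio logic in the deleted valid_epoch trades evaluation speed against metric coverage: generate() only runs on every N-th batch. A standalone restatement of that interval computation (the function name is illustrative):

def batches_between_decodes(decode_ratio: float):
    if decode_ratio <= 0.0:
        return float("inf")  # never decode
    if decode_ratio >= 1.0:
        return 1  # decode every batch
    return max(int(1 / decode_ratio), 1)

# e.g. decode_ratio=0.25 runs text generation on every 4th validation batch.
assert batches_between_decodes(0.25) == 4
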
NatureLM/storage_utils.py DELETED
@@ -1,26 +0,0 @@
1
- import logging
2
- import os
3
- from functools import lru_cache
4
- from typing import Union
5
-
6
- import cloudpathlib
7
- from google.cloud.storage.client import Client
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
-
12
- def is_gcs_path(path: Union[str, os.PathLike]) -> bool:
13
- return str(path).startswith("gs://")
14
-
15
-
16
- @lru_cache(maxsize=1)
17
- def _get_client():
18
- return cloudpathlib.GSClient(storage_client=Client())
19
-
20
-
21
- try:
22
- _gcp_storage_client = _get_client()
23
- except Exception:
24
- logger.warning("Failed to initialize GCS client. Training won't be able to use GSPath or R2Path without a client.")
25
- _gcp_storage_client = None
26
-
NatureLM/task_metric_utils.py DELETED
@@ -1,283 +0,0 @@
1
- # Taken from DCASE 2021 Task 5 evaluation source code
2
- # https://github.com/c4dm/dcase-few-shot-bioacoustic
3
- # MIT License
4
-
5
- import mir_eval
6
- import numpy as np
7
- import scipy
8
-
9
-
10
- def fast_intersect(ref, est):
11
- """Find all intersections between reference events and estimated events (fast).
12
- Best-case complexity: O(N log N + M log M) where N=length(ref) and M=length(est)
13
- Parameters
14
- ----------
15
- ref: np.ndarray [shape=(2, n)], real-valued
16
- Array of reference events. Each column is an event.
17
- The first row denotes onset times and the second row denotes offset times.
18
- est: np.ndarray [shape=(2, m)], real-valued
19
- Array of estimated events. Each column is an event.
20
- The first row denotes onset times and the second row denotes offset times.
21
- Returns
22
- -------
23
- matches: list of sets, length n, integer-valued
24
- Property: matches[i] contains the set of all indices j such that
25
- (ref[0, i]<=est[1, j]) AND (ref[1, i]>=est[0, j])
26
- """
27
- ref_on_argsort = np.argsort(ref[0, :])
28
- ref_off_argsort = np.argsort(ref[1, :])
29
-
30
- est_on_argsort = np.argsort(est[0, :])
31
- est_off_argsort = np.argsort(est[1, :])
32
-
33
- est_on_maxindex = est.shape[1]
34
- est_off_minindex = 0
35
- estref_matches = [set()] * ref.shape[1]
36
- refest_matches = [set()] * ref.shape[1]
37
- for ref_id in range(ref.shape[1]):
38
- ref_onset = ref[0, ref_on_argsort[ref_id]]
39
- est_off_sorted = est[1, est_off_argsort[est_off_minindex:]]
40
- search_result = np.searchsorted(est_off_sorted, ref_onset, side="left")
41
- est_off_minindex += search_result
42
- refest_match = est_off_argsort[est_off_minindex:]
43
- refest_matches[ref_on_argsort[ref_id]] = set(refest_match)
44
-
45
- ref_offset = ref[1, ref_off_argsort[-1 - ref_id]]
46
- est_on_sorted = est[0, est_on_argsort[: (1 + est_on_maxindex)]]
47
- search_result = np.searchsorted(est_on_sorted, ref_offset, side="right")
48
- est_on_maxindex = search_result - 1
49
- estref_match = est_on_argsort[: (1 + est_on_maxindex)]
50
- estref_matches[ref_off_argsort[-1 - ref_id]] = set(estref_match)
51
-
52
- zip_iterator = zip(refest_matches, estref_matches)
53
- matches = [x.intersection(y) for (x, y) in zip_iterator]
54
- return matches
55
-
56
-
57
- def iou(ref, est, method="fast"):
58
- """Compute pairwise "intersection over union" (IOU) metric between reference
59
- events and estimated events.
60
- Let us denote by a_i and b_i the onset and offset of reference event i.
61
- Let us denote by u_j and v_j the onset and offset of estimated event j.
62
- The IOU between events i and j is defined as
63
- (min(b_i, v_j)-max(a_i, u_j)) / (max(b_i, v_j)-min(a_i, u_j))
64
- if the events are non-disjoint, and equal to zero otherwise.
65
- Parameters
66
- ----------
67
- ref: np.ndarray [shape=(2, n)], real-valued
68
- Array of reference events. Each column is an event.
69
- The first row denotes onset times and the second row denotes offset times.
70
- est: np.ndarray [shape=(2, m)], real-valued
71
- Array of estimated events. Each column is an event.
72
- The first row denotes onset times and the second row denotes offset times.
73
- method: str, optional.
74
- If "fast" (default), computes pairwise intersections via a custom
75
- dynamic programming algorithm, see fast_intersect.
76
- If "slow", computes pairwise intersections via bruteforce quadratic
77
- search, see slow_intersect.
78
- Returns
79
- -------
80
- S: scipy.sparse.dok.dok_matrix, real-valued
81
- Sparse 2-D matrix. S[i,j] contains the IOU between ref[i] and est[j]
82
- if these events are non-disjoint and zero otherwise.
83
- """
84
- n_refs = ref.shape[1]
85
- n_ests = est.shape[1]
86
- S = scipy.sparse.dok_matrix((n_refs, n_ests))
87
-
88
- if method == "fast":
89
- matches = fast_intersect(ref, est)
90
- elif method == "slow":
91
- matches = slow_intersect(ref, est)
92
-
93
- for ref_id in range(n_refs):
94
- matching_ests = matches[ref_id]
95
- ref_on = ref[0, ref_id]
96
- ref_off = ref[1, ref_id]
97
-
98
- for matching_est_id in matching_ests:
99
- est_on = est[0, matching_est_id]
100
- est_off = est[1, matching_est_id]
101
- intersection = min(ref_off, est_off) - max(ref_on, est_on)
102
- union = max(ref_off, est_off) - min(ref_on, est_on)
103
- intersection_over_union = intersection / union
104
- S[ref_id, matching_est_id] = intersection_over_union
105
-
106
- return S
107
-
108
- def compute_intersection(ref, est, method="fast"):
109
- """Compute pairwise intersection between reference
110
- events and estimated events.
111
- Let us denote by a_i and b_i the onset and offset of reference event i.
112
- Let us denote by u_j and v_j the onset and offset of estimated event j.
113
- The Intersection between events i and j is defined as
114
- (min(b_i, v_j)-max(a_i, u_j))
115
- if the events are non-disjoint, and equal to zero otherwise.
116
- Parameters
117
- ----------
118
- ref: np.ndarray [shape=(2, n)], real-valued
119
- Array of reference events. Each column is an event.
120
- The first row denotes onset times and the second row denotes offset times.
121
- est: np.ndarray [shape=(2, m)], real-valued
122
- Array of estimated events. Each column is an event.
123
- The first row denotes onset times and the second row denotes offset times.
124
- method: str, optional.
125
- If "fast" (default), computes pairwise intersections via a custom
126
- dynamic programming algorithm, see fast_intersect.
127
- If "slow", computes pairwise intersections via bruteforce quadratic
128
- search, see slow_intersect.
129
- Returns
130
- -------
131
- S: scipy.sparse.dok.dok_matrix, real-valued
132
- Sparse 2-D matrix. S[i,j] contains the Intersection between ref[i] and est[j]
133
- if these events are non-disjoint and zero otherwise.
134
- """
135
- n_refs = ref.shape[1]
136
- n_ests = est.shape[1]
137
- S = scipy.sparse.dok_matrix((n_refs, n_ests))
138
-
139
- if method == "fast":
140
- matches = fast_intersect(ref, est)
141
- elif method == "slow":
142
- matches = slow_intersect(ref, est)
143
-
144
- for ref_id in range(n_refs):
145
- matching_ests = matches[ref_id]
146
- ref_on = ref[0, ref_id]
147
- ref_off = ref[1, ref_id]
148
-
149
- for matching_est_id in matching_ests:
150
- est_on = est[0, matching_est_id]
151
- est_off = est[1, matching_est_id]
152
- intersection = min(ref_off, est_off) - max(ref_on, est_on)
153
- # union = max(ref_off, est_off) - min(ref_on, est_on)
154
- # intersection_over_union = intersection / union
155
- S[ref_id, matching_est_id] = intersection #_over_union
156
-
157
- return S
158
-
159
-
160
- def match_events(ref, est, min_iou=0.0, method="fast"):
161
- """
162
- Compute a maximum matching between reference and estimated event times,
163
- subject to a criterion of minimum intersection-over-union (IOU).
164
- Given two lists of events ``ref`` (reference) and ``est`` (estimated),
165
- we seek the largest set of correspondences ``(ref[i], est[j])`` such that
166
- ``iou(ref[i], est[j]) > min_iou``
167
- and such that each ``ref[i]`` and ``est[j]`` is matched at most once.
168
- This function is strongly inspired by mir_eval.onset.util.match_events.
169
- It relies on mir_eval's implementation of the Hopcroft-Karp algorithm from
170
- maximum bipartite graph matching. However, one important difference is that
171
- mir_eval's distance function relies purely on onset times, whereas this function
172
- considers both onset times and offset times to compute the IOU metric between
173
- reference events and estimated events.
174
- Parameters
175
- ----------
176
- ref: np.ndarray [shape=(2, n)], real-valued
177
- Array of reference events. Each column is an event.
178
- The first row denotes onset times and the second row denotes offset times.
179
- est: np.ndarray [shape=(2, m)], real-valued
180
- Array of estimated events. Each column is an event.
181
- The first row denotes onset times and the second row denotes offset times.
182
- min_iou: real number in [0, 1). Default: 0.
183
- Threshold for minimum amount of intersection over union (IOU) to match
184
- any two events. See the iou method for implementation details.
185
- method: str, optional.
186
- If "fast" (default), computes pairwise intersections via a custom
187
- dynamic programming algorithm, see fast_intersect.
188
- If "slow", computes pairwise intersections via bruteforce quadratic
189
- search, see slow_intersect.
190
- Returns
191
- -------
192
- matching : list of tuples
193
- Every tuple corresponds to a match between one reference event and
194
- one estimated event.
195
- ``matching[i] == (i, j)`` where ``ref[i]`` matches ``est[j]``.
196
- Note that all values i and j appear at most once in the list.
197
- """
198
-
199
- # Intersect reference events and estimated events
200
- S = iou(ref, est, method=method)
201
-
202
- # Threshold intersection-over-union (IOU) ratio
203
- S_bool = scipy.sparse.dok_matrix(S > min_iou)
204
- hits = S_bool.keys()
205
-
206
- # Construct the bipartite graph
207
- G = {}
208
- for ref_i, est_i in hits:
209
- if est_i not in G:
210
- G[est_i] = []
211
- G[est_i].append(ref_i)
212
-
213
- # Apply Hopcroft-Karp algorithm (from mir_eval package)
214
- # to obtain maximum bipartite graph matching
215
- matching = sorted(mir_eval.util._bipartite_match(G).items())
216
- return matching
217
-
218
-
219
- def slow_intersect(ref, est):
220
- """Find all intersections between reference events and estimated events (slow).
221
- Best-case complexity: O(N*M) where N=ref.shape[1] and M=est.shape[1]
222
- Parameters
223
- ----------
224
- ref: np.ndarray [shape=(2, n)], real-valued
225
- Array of reference events. Each column is an event.
226
- The first row denotes onset times and the second row denotes offset times.
227
- est: np.ndarray [shape=(2, m)], real-valued
228
- Array of estimated events. Each column is an event.
229
- The first row denotes onset times and the second row denotes offset times.
230
- Returns
231
- -------
232
- matches: list of sets, length n, integer-valued
233
- Property: matches[i] contains the set of all indices j such that
234
- (ref[0, i]<=est[1, j]) AND (ref[1, i]>=est[0, j])
235
- """
236
- matches = []
237
- for i in range(ref.shape[1]):
238
- matches.append(
239
- set(
240
- [
241
- j
242
- for j in range(est.shape[1])
243
- if ((ref[0, i] <= est[1, j]) and (ref[1, i] >= est[0, j]))
244
- ]
245
- )
246
- )
247
- return matches
248
-
249
-
250
- def frames_to_st_dict(x, sr=16000):
251
- # x : Tensor of shape (batch, time) or (time,). Entries are 2 (POS), 1 (UNK), and 0 (NEG).
252
- # returns a list of dicts {"Begin Time (s)" : [...], "End Time (s)" : [...], "Annotation" : [...]} if batch dim exists, or a single dict
253
-
254
- if len(x.size()) == 2:
255
- outs = []
256
- for i in range(x.size(0)):
257
- x_sub = x[i,:]
258
- outs.append(_frames_to_st_dict_single(x_sub, sr=sr))
259
- return outs
260
- else:
261
- return _frames_to_st_dict_single(x, sr=sr)
262
-
263
- def _frames_to_st_dict_single(x, sr=16000):
264
- d = {"Begin Time (s)" : [], "End Time (s)" : [], "Annotation" : []}
265
-
266
- for label_i in [1,2]:
267
-
268
- labels = x.numpy() == label_i # POS : 2, UNK : 1, NEG : 0
269
-
270
- starts = np.where((~labels[:-1]) & (labels[1:]))[0] + 1
271
- if labels[0]:
272
- starts = np.insert(starts, 0, 0)
273
-
274
- ends = np.where((labels[:-1]) & (~labels[1:]))[0] + 1
275
- if labels[-1]:
276
- ends = np.append(ends, len(labels))
277
-
278
- for start, end in zip(starts, ends):
279
- d["Begin Time (s)"].append(start/sr)
280
- d["End Time (s)"].append(end/sr)
281
- d["Annotation"].append("POS" if label_i == 2 else "UNK")
282
-
283
- return d
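
A worked example of the pairwise event IOU computed above: events (0 s, 2 s) and (1 s, 3 s) overlap for 1 s and jointly span 3 s, so their IOU is 1/3. A self-contained restatement (helper name illustrative):

def event_iou(a_on: float, a_off: float, b_on: float, b_off: float) -> float:
    intersection = min(a_off, b_off) - max(a_on, b_on)
    if intersection <= 0:
        return 0.0  # disjoint events
    union = max(a_off, b_off) - min(a_on, b_on)
    return intersection / union

assert abs(event_iou(0.0, 2.0, 1.0, 3.0) - 1 / 3) < 1e-9
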
NatureLM/task_metrics.py DELETED
@@ -1,128 +0,0 @@
1
- import re
2
- from abc import ABC, abstractmethod
3
- from typing import List, Tuple
4
-
5
- import numpy as np
6
-
7
- from NatureLM.task_metric_utils import match_events
8
-
9
- # Assume the following functions are imported from the reference implementations:
10
- # - match_events
11
- # - iou
12
- # - fast_intersect
13
- # - slow_intersect
14
- # - compute_intersection
15
-
16
-
17
- class Metric(ABC):
18
- @abstractmethod
19
- def compute_metric(self, predicted_texts: List[str], gold_texts: List[str]) -> float:
20
- pass
21
-
22
-
23
- class ExactAccuracy(Metric):
24
- """Exact-match accuracy metric."""
25
-
26
- def compute_metric(self, predicted_texts: List[str], gold_texts: List[str]) -> float:
27
- predicted_texts = [pt.lower().strip() for pt in predicted_texts]
28
- gold_texts = [gt.lower().strip() for gt in gold_texts]
29
- correct = sum(p == g for p, g in zip(predicted_texts, gold_texts))
30
- return correct / len(gold_texts) if gold_texts else 0.0
31
-
32
-
33
- class FewShot(Metric):
34
- """Few-shot learning metric based on event matching using IoU."""
35
-
36
- def compute_metric(self, predicted_texts: List[str], gold_texts: List[str]) -> float:
37
- # Initialize counts
38
- total_TP = 0
39
- total_FP = 0
40
- total_FN = 0
41
-
42
- for pred_text, gold_text in zip(predicted_texts, gold_texts):
43
- # Extract events from texts
44
- pred_events = parse_timestamps_from_text(pred_text)
45
- gold_events = parse_timestamps_from_text(gold_text)
46
-
47
- # Convert events to numpy arrays for match_events function
48
- # Each event is (start_time, end_time), need to transpose to shape (2, n)
49
- pred_array = np.array(pred_events).T if pred_events else np.empty((2, 0))
50
- gold_array = np.array(gold_events).T if gold_events else np.empty((2, 0))
51
-
52
- # Use match_events function from the reference implementation
53
- matches = match_events(gold_array, pred_array, min_iou=0.5, method="fast")
54
-
55
- TP = len(matches)
56
- FP = len(pred_events) - TP
57
- FN = len(gold_events) - TP
58
-
59
- total_TP += TP
60
- total_FP += FP
61
- total_FN += FN
62
-
63
- # Compute precision, recall, and F1 score
64
- precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) > 0 else 0.0
65
- recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) > 0 else 0.0
66
- f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
67
-
68
- return f1_score
69
-
70
-
71
- class NoneAccuracy(Metric):
72
- """Accuracy for cases where 'None' is the correct answer."""
73
-
74
- def compute_metric(self, predicted_texts: List[str], gold_texts: List[str]) -> float:
75
- # Normalize texts
76
- predicted_texts = [pt.lower().strip() for pt in predicted_texts]
77
- gold_texts = [gt.lower().strip() for gt in gold_texts]
78
- # Filter indices where gold_text is 'none'
79
- indices = [i for i, gt in enumerate(gold_texts) if gt == "none"]
80
- if not indices:
81
- return 0.0 # No 'None' cases in gold_texts
82
- correct = sum(predicted_texts[i] == "none" for i in indices)
83
- return correct / len(indices)
84
-
85
-
86
- class MultipleSpeciesAccuracy(Metric):
87
- """Accuracy for cases where the correct answer has at least one comma (multiple species)."""
88
-
89
- def compute_metric(self, predicted_texts: List[str], gold_texts: List[str]) -> float:
90
- # Normalize texts
91
- predicted_texts = [pt.lower().strip() for pt in predicted_texts]
92
- gold_texts = [gt.lower().strip() for gt in gold_texts]
93
- # Filter indices where gold_text contains at least one comma
94
- indices = [i for i, gt in enumerate(gold_texts) if "," in gt]
95
- if not indices:
96
- return 0.0 # No multiple-species cases in gold_texts
97
- correct = sum(predicted_texts[i] == gold_texts[i] for i in indices)
98
- return correct / len(indices)
99
-
100
-
101
- def get_task_metrics(task: str) -> List[Metric]:
102
- """Get a list of metric instances appropriate for the given task."""
103
- all_metrics = []
104
- metrics_dict = {}
105
-
106
- if "classification" in task:
107
- metrics_dict["ExactAccuracy"] = ExactAccuracy()
108
- if "fewshot" in task:
109
- metrics_dict["FewShot"] = FewShot()
110
- if "detection" in task:
111
- metrics_dict["ExactAccuracy"] = ExactAccuracy() # Ensures no duplicate
112
- metrics_dict["NoneAccuracy"] = NoneAccuracy()
113
- metrics_dict["MultipleSpeciesAccuracy"] = MultipleSpeciesAccuracy()
114
-
115
- all_metrics = list(metrics_dict.values())
116
- return all_metrics
117
-
118
-
119
- def parse_timestamps_from_text(text: str) -> List[Tuple[float, float]]:
120
- """
121
- Function to parse timestamps from text.
122
- Extracts timestamps in the format "start-end" where start and end are floats.
123
- """
124
- # Regular expression to extract timestamps in the format "start-end"
125
- pattern = r"(\d+\.\d+)-(\d+\.\d+)"
126
- matches = re.findall(pattern, text)
127
- events = [(float(start), float(end)) for start, end in matches]
128
- return events
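
For context, the regex above extracts "start-end" pairs written with decimal points, so a model output such as "calls at 0.25-1.75 and 3.00-4.50" yields [(0.25, 1.75), (3.0, 4.5)]. A standalone check (function name illustrative):

import re

def parse_timestamps(text: str) -> list[tuple[float, float]]:
    return [(float(a), float(b)) for a, b in re.findall(r"(\d+\.\d+)-(\d+\.\d+)", text)]

assert parse_timestamps("calls at 0.25-1.75 and 3.00-4.50") == [(0.25, 1.75), (3.0, 4.5)]
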
NatureLM/utils.py CHANGED
@@ -25,9 +25,7 @@ import soundfile as sf
25
  import torch
26
  import torch.nn.functional as F
27
  import torchaudio
28
- from torch.utils.data import DataLoader, DistributedSampler
29
-
30
- from NatureLM.dist_utils import get_rank, get_world_size
31
 
32
  logger = logging.getLogger(__name__)
33
 
@@ -99,29 +97,6 @@ def now_as_str() -> str:
99
  return datetime.now().strftime("%Y%m%d%H%M")
100
 
101
 
102
- def get_dataloader(dataset, config, is_train=True, use_distributed=True):
103
- if use_distributed:
104
- sampler = DistributedSampler(dataset, shuffle=is_train, num_replicas=get_world_size(), rank=get_rank())
105
- else:
106
- sampler = None
107
-
108
- loader = DataLoader(
109
- dataset,
110
- batch_size=config.batch_size_train if is_train else config.batch_size_eval,
111
- num_workers=config.num_workers,
112
- pin_memory=False,
113
- sampler=sampler,
114
- shuffle=sampler is None and is_train,
115
- collate_fn=dataset.collater,
116
- drop_last=is_train,
117
- )
118
-
119
- if is_train:
120
- loader = IterLoader(loader, use_distributed=use_distributed)
121
-
122
- return loader
123
-
124
-
125
  def apply_to_sample(f, sample):
126
  if len(sample) == 0:
127
  return {}
 
25
  import torch
26
  import torch.nn.functional as F
27
  import torchaudio
28
+ from torch.utils.data import DataLoader
29
 
30
  logger = logging.getLogger(__name__)
31
 
 
97
  return datetime.now().strftime("%Y%m%d%H%M")
98
 
99
 
100
  def apply_to_sample(f, sample):
101
  if len(sample) == 0:
102
  return {}
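
With get_dataloader and the distributed sampler removed, inference-time callers can fall back to a plain DataLoader. A minimal sketch, assuming the datasets still expose a collater method as the deleted helper did:

from torch.utils.data import DataLoader

def simple_loader(dataset, batch_size: int = 1, num_workers: int = 0) -> DataLoader:
    return DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,  # keep evaluation order deterministic
        collate_fn=getattr(dataset, "collater", None),
    )
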
Space.yaml CHANGED
@@ -1,3 +1,3 @@
1
  sdk: gradio
2
  python_version: 3.10
3
- hardware: cpu
 
1
  sdk: gradio
2
  python_version: 3.10
3
+ hardware: gpu
app.py CHANGED
@@ -1,37 +1,86 @@
1
- import re
2
- import tempfile
3
- from collections import Counter
4
  from pathlib import Path
5
- from typing import Literal, Optional
 
6
 
7
  import gradio as gr
8
  import torch
9
 
10
  from NatureLM.config import Config
11
  from NatureLM.models.NatureLM import NatureLM
12
- from NatureLM.utils import generate_sample_batches, prepare_sample_waveforms
13
  import spaces
14
 
15
 
16
  class ModelManager:
17
  """Manages model loading and state"""
18
-
19
  def __init__(self):
20
  self.model: Optional[NatureLM] = None
21
  self.config: Optional[Config] = None
22
  self.is_loaded = False
23
  self.is_loading = False
24
  self.load_failed = False
25
-
26
  def check_availability(self) -> tuple[bool, str]:
27
  """Check if the model is available for download"""
28
  try:
29
  from huggingface_hub import model_info
 
30
  info = model_info("EarthSpeciesProject/NatureLM-audio")
31
  return True, "Model is available"
32
  except Exception as e:
33
  return False, f"Model not available: {str(e)}"
34
-
35
  def reset_state(self):
36
  """Reset the model loading state to allow retrying after a failure"""
37
  self.model = None
@@ -39,7 +88,7 @@ class ModelManager:
39
  self.is_loading = False
40
  self.load_failed = False
41
  return self.get_status()
42
-
43
  def get_status(self) -> str:
44
  """Get the current model loading status"""
45
  if self.is_loaded:
@@ -50,34 +99,35 @@ class ModelManager:
50
  return "❌ Model failed to load. Please check the configuration."
51
  else:
52
  return "⏳ Ready to load model on first use"
53
-
54
  def load_model(self) -> Optional[NatureLM]:
55
  """Load the model if needed"""
56
  if self.is_loaded:
57
  return self.model
58
-
59
  if self.is_loading or self.load_failed:
60
  return None
61
-
62
  try:
63
  self.is_loading = True
64
  print("Loading model...")
65
-
66
  # Check if model is available first
67
  available, message = self.check_availability()
68
  if not available:
69
  raise Exception(f"Model not available: {message}")
70
-
71
  model = NatureLM.from_pretrained("EarthSpeciesProject/NatureLM-audio")
72
- model.to("cuda")
73
  model.eval()
74
-
75
- self.model = model
 
76
  self.is_loaded = True
77
  self.is_loading = False
78
  print("Model loaded successfully!")
79
- return model
80
-
81
  except Exception as e:
82
  print(f"Error loading model: {e}")
83
  self.is_loading = False
@@ -88,12 +138,44 @@ class ModelManager:
88
  # Global model manager instance
89
  model_manager = ModelManager()
90
 
91
-
92
  @spaces.GPU
93
- def prompt_lm(audios: list[str], messages: list[dict[str, str]]) -> str:
94
- """Generate response using the model"""
95
  model = model_manager.load_model()
96
-
97
  if model is None:
98
  if model_manager.is_loading:
99
  return "🔄 Loading model... This may take a few minutes on first use. Please try again in a moment."
@@ -101,284 +183,63 @@ def prompt_lm(audios: list[str], messages: list[dict[str, str]]) -> str:
101
  return "❌ Model failed to load. This could be due to:\n• No internet connection\n• Insufficient disk space\n• Model repository access issues\n\nPlease check your connection and try again using the retry button."
102
  else:
103
  return "Demo mode: Model not loaded. Please check the model configuration."
104
-
105
- cuda_enabled = torch.cuda.is_available()
106
- samples = prepare_sample_waveforms(audios, cuda_enabled)
107
- prompt_text = model.llama_tokenizer.apply_chat_template(
108
- messages, tokenize=False, add_generation_prompt=True
109
- ).removeprefix(model.llama_tokenizer.bos_token)
110
-
111
- prompt_text = re.sub(
112
- r"<\|start_header_id\|>system<\|end_header_id\|>\n\nCutting Knowledge Date: [^\n]+\nToday Date: [^\n]+\n\n<\|eot_id\|>",
113
- "",
114
- prompt_text,
115
- )
116
- prompt_text = re.sub("\\n", r"\\n", prompt_text)
117
-
118
- print(f"{prompt_text=}")
119
- with torch.cuda.amp.autocast(dtype=torch.float16):
120
- llm_answer = model.generate(samples, model_manager.config.generate, prompts=[prompt_text])
121
- return llm_answer[0]
122
-
123
-
124
- def _multimodal_textbox_factory():
125
- return gr.MultimodalTextbox(
126
- value=None,
127
- interactive=True,
128
- sources="microphone",
129
- placeholder="Enter message...",
130
- show_label=False,
131
- autofocus=True,
132
- submit_btn="Send"
133
  )
 
134
 
135
 
136
  def user_message(content):
137
  return {"role": "user", "content": content}
138
 
139
 
140
- def add_message(history, message):
141
- for x in message["files"]:
142
- history.append(user_message({"path": x}))
143
- if message["text"]:
144
- history.append(user_message(message["text"]))
145
- return history, _multimodal_textbox_factory()
146
-
147
-
148
- def combine_model_inputs(msgs: list[dict[str, str]]) -> dict[str, list[str]]:
149
- messages = []
150
- files = []
151
- for msg in msgs:
152
- print(msg, messages, files)
153
- match msg:
154
- case {"content": (path,)}:
155
- messages.append({"role": msg["role"], "content": "<Audio><AudioHere></Audio> "})
156
- files.append(path)
157
- case _:
158
- messages.append(msg)
159
-
160
- # Join consecutive messages from the same role
161
- joined_messages = []
162
- for msg in messages:
163
- if joined_messages and joined_messages[-1]["role"] == msg["role"]:
164
- joined_messages[-1]["content"] += msg["content"]
165
- else:
166
- joined_messages.append(msg)
167
-
168
- return {"messages": joined_messages, "files": files}
169
 
170
-
171
- def bot_response(history: list):
172
- print(type(history))
173
- combined_inputs = combine_model_inputs(history)
174
- response = prompt_lm(combined_inputs["files"], combined_inputs["messages"])
175
- history.append({"role": "assistant", "content": response})
176
- return history
177
-
178
-
179
- def _chat_tab(examples):
180
- # Status indicator
181
- status_text = gr.Textbox(
182
- value=model_manager.get_status(),
183
- label="Model Status",
184
- interactive=False,
185
- visible=True
186
  )
187
-
188
- chatbot = gr.Chatbot(
189
- label="Chat",
190
- elem_id="chatbot",
191
- bubble_full_width=False,
192
- type="messages",
193
- render_markdown=False,
194
- resizeable=True
195
- )
196
-
197
- chat_input = _multimodal_textbox_factory()
198
- send_all = gr.Button("Send all", elem_id="send-all")
199
- clear_button = gr.ClearButton(components=[chatbot, chat_input], visible=False)
200
-
201
- chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
202
- bot_msg = send_all.click(
203
- bot_response,
204
- [chatbot],
205
- [chatbot],
206
- api_name="bot_response",
207
- )
208
-
209
- # Update status after bot response
210
- bot_msg.then(lambda: model_manager.get_status(), None, [status_text])
211
- bot_msg.then(lambda: gr.ClearButton(visible=True), None, [clear_button])
212
- clear_button.click(lambda: gr.ClearButton(visible=False), None, [clear_button])
213
-
214
- gr.Examples(
215
- list(examples.values()),
216
- chatbot,
217
- chatbot,
218
- example_labels=list(examples.keys()),
219
- examples_per_page=20,
220
- )
221
-
222
-
223
- def summarize_batch_results(results):
224
- summary = Counter(results)
225
- summary_str = "\n".join(f"{k}: {v}" for k, v in summary.most_common())
226
- return summary_str
227
-
228
-
229
- def run_batch_inference(files, task, progress=gr.Progress()) -> str:
230
- model = model_manager.load_model()
231
- if model is None:
232
- if model_manager.is_loading:
233
- return "🔄 Loading model... This may take a few minutes on first use. Please try again in a moment."
234
- elif model_manager.load_failed:
235
- return "❌ Model failed to load. This could be due to:\n• No internet connection\n• Insufficient disk space\n• Model repository access issues\n\nPlease check your connection and try again."
236
- else:
237
- return "Demo mode: Model not loaded. Please check the model configuration."
238
-
239
- outputs = []
240
- prompt = "<Audio><AudioHere></Audio> " + task
241
-
242
- for file in progress.tqdm(files):
243
- outputs.append(prompt_lm([file], [{"role": "user", "content": prompt}]))
244
-
245
- batch_summary: str = summarize_batch_results(outputs)
246
- report = f"Batch summary:\n{batch_summary}\n\n"
247
- return report
248
-
249
-
250
- def multi_extension_glob_mask(mask_base, *extensions):
251
- mask_ext = ["[{}]".format("".join(set(c))) for c in zip(*extensions)]
252
- if not mask_ext or len(set(len(e) for e in extensions)) > 1:
253
- mask_ext.append("*")
254
- return mask_base + "".join(mask_ext)
255
-
256
-
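For reference, a rough sketch of the glob mask the removed helper builds for the extensions used by `_batch_tab`; the character order inside each bracket can vary because Python sets are unordered.

```python
# Illustrative only: reproduces the removed helper's behaviour.
mask = multi_extension_glob_mask("**.", "mp3", "flac", "wav")
# zip(*extensions) pairs characters position-wise and stops at the shortest
# extension; the differing extension lengths then append the trailing "*".
# mask is roughly "**.[mfw][pla][3av]*"  (matches .mp3, .flac, .wav and similar)
```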
257
- def _batch_tab(file_selection: Literal["upload", "explorer"] = "upload"):
258
- if file_selection == "explorer":
259
- files = gr.FileExplorer(
260
- glob=multi_extension_glob_mask("**.", "mp3", "flac", "wav"),
261
- label="Select audio files",
262
- file_count="multiple",
263
  )
264
- elif file_selection == "upload":
265
- files = gr.Files(label="Uploaded files", file_types=["audio"], height=300)
266
- task = gr.Textbox(label="Task", placeholder="Enter task...", show_label=True)
267
-
268
- process_btn = gr.Button("Process")
269
- output = gr.TextArea()
270
-
271
- process_btn.click(
272
- run_batch_inference,
273
- [files, task],
274
- [output],
275
- )
276
-
277
-
278
- def to_raven_format(outputs: dict[int, str], chunk_len: int = 10) -> str:
279
- def get_line(row, start, end, annotation):
280
- return f"{row}\tSpectrogram 1\t1\t{start}\t{end}\t0\t8000\t{annotation}"
281
-
282
- raven_output = ["Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tAnnotation"]
283
- current_offset = 0
284
- last_label = ""
285
- row = 1
286
-
287
- for offset, label in sorted(outputs.items()):
288
- if label != last_label and last_label:
289
- raven_output.append(get_line(row, current_offset, offset, last_label))
290
- current_offset = offset
291
- row += 1
292
- if not last_label:
293
- current_offset = offset
294
- if label != "None":
295
- last_label = label
296
  else:
297
- last_label = ""
298
- if last_label:
299
- raven_output.append(get_line(row, current_offset, current_offset + chunk_len, last_label))
300
-
301
- return "\n".join(raven_output)
302
-
303
-
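For reference, a small hedged sketch of the Raven selection table the removed helper produced; the offsets and the single "Robin" label below are made up.

```python
# Hypothetical per-chunk labels keyed by start offset (seconds).
outputs = {0: "Robin", 5: "Robin", 10: "None"}
print(to_raven_format(outputs, chunk_len=10))
# Selection	View	Channel	Begin Time (s)	End Time (s)	Low Freq (Hz)	High Freq (Hz)	Annotation
# 1	Spectrogram 1	1	0	10	0	8000	Robin
```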
304
- def _run_long_recording_inference(file, task, chunk_len: int = 10, hop_len: int = 5, progress=gr.Progress()):
305
- # Check if model is loading
306
- if model_manager.is_loading:
307
- return "🔄 Loading model... This may take a few minutes on first use. Please try again in a moment.", None
308
-
309
- # Check if model failed to load
310
- if model_manager.load_failed:
311
- return "❌ Model failed to load. This could be due to:\n• No internet connection\n• Insufficient disk space\n• Model repository access issues\n\nPlease refresh the page to try again.", None
312
-
313
- model = model_manager.load_model()
314
- if model is None:
315
- return "Demo mode: Model not loaded. Please check the model configuration.", None
316
-
317
- cuda_enabled = torch.cuda.is_available()
318
- outputs = {}
319
- offset = 0
320
-
321
- prompt = f"<Audio><AudioHere></Audio> {task}"
322
- prompt = model_manager.config.model.prompt_template.format(prompt)
323
-
324
- for batch in progress.tqdm(generate_sample_batches(file, cuda_enabled, chunk_len=chunk_len, hop_len=hop_len)):
325
- prompt_strs = [prompt] * len(batch["audio_chunk_sizes"])
326
- with torch.cuda.amp.autocast(dtype=torch.float16):
327
- llm_answers = model.generate(batch, model_manager.config.generate, prompts=prompt_strs)
328
- for answer in llm_answers:
329
- outputs[offset] = answer
330
- offset += hop_len
331
-
332
- report = f"Number of chunks: {len(outputs)}\n\n"
333
- for offset in sorted(outputs.keys()):
334
- report += f"{offset:02d}s:\t{outputs[offset]}\n"
335
-
336
- raven_output = to_raven_format(outputs, chunk_len=chunk_len)
337
- with tempfile.NamedTemporaryFile(mode="w", prefix="raven-", suffix=".txt", delete=False) as f:
338
- f.write(raven_output)
339
- raven_file = f.name
340
-
341
- return report, raven_file
342
-
343
-
344
- def _long_recording_tab():
345
- audio_input = gr.Audio(label="Upload audio file", type="filepath")
346
- task = gr.Dropdown(
347
- [
348
- "What are the common names for the species in the audio, if any?",
349
- "Caption the audio.",
350
- "Caption the audio, using the scientific name for any animal species.",
351
- "Caption the audio, using the common name for any animal species.",
352
- "What is the scientific name for the focal species in the audio?",
353
- "What is the common name for the focal species in the audio?",
354
- "What is the family of the focal species in the audio?",
355
- "What is the genus of the focal species in the audio?",
356
- "What is the taxonomic name of the focal species in the audio?",
357
- "What call types are heard from the focal species in the audio?",
358
- "What is the life stage of the focal species in the audio?",
359
- ],
360
- label="Tasks",
361
- allow_custom_value=True,
362
- )
363
- with gr.Accordion("Advanced options", open=False):
364
- hop_len = gr.Slider(1, 10, 5, label="Hop length (seconds)", step=1)
365
- chunk_len = gr.Slider(1, 10, 10, label="Chunk length (seconds)", step=1)
366
- process_btn = gr.Button("Process")
367
- output = gr.TextArea()
368
- download_raven = gr.DownloadButton("Download Raven file")
369
-
370
- process_btn.click(
371
- _run_long_recording_inference,
372
- [audio_input, task, chunk_len, hop_len],
373
- [output, download_raven],
374
- )
375
 
376
 
377
  def main(
378
  assets_dir: Path,
379
  cfg_path: str | Path,
380
  options: list[str] = [],
381
- device: str = "cuda",
382
  ):
383
  # Load configuration
384
  try:
@@ -394,7 +255,7 @@ def main(
394
  if not assets_dir.exists():
395
  print(f"Warning: Assets directory {assets_dir} does not exist")
396
  assets_dir.mkdir(exist_ok=True)
397
-
398
  # Create placeholder audio files if they don't exist
399
  laz_audio = assets_dir / "Lazuli_Bunting_yell-YELLLAZB20160625SM303143.mp3"
400
  frog_audio = assets_dir / "nri-GreenTreeFrogEvergladesNP.mp3"
@@ -411,7 +272,9 @@ def main(
411
  "Caption the audio (Green Tree Frog)": [
412
  [
413
  user_message({"path": str(frog_audio)}),
414
- user_message("Caption the audio, using the common name for any animal species."),
 
 
415
  ]
416
  ],
417
  "Caption the audio (American Robin)": [
@@ -428,17 +291,31 @@ def main(
428
  ],
429
  }
430
 
431
- with gr.Blocks(title="NatureLM-audio", theme=gr.themes.Base(primary_hue="blue", font=[gr.themes.GoogleFont("Noto Sans")])) as app:
 
432
  header = gr.HTML("""
433
  <div style="display: flex; align-items: center; gap: 12px;"><h2 style="margin: 0;">NatureLM-audio<span style="font-size: 0.55em; color: #28a745; background: #e6f4ea; padding: 2px 6px; border-radius: 4px; margin-left: 8px; display: inline-block; vertical-align: top;">BETA</span></h2></div>
434
 
435
  """)
436
-
437
  with gr.Tabs():
438
  with gr.Tab("Analyze Audio"):
439
- uploaded_audio = gr.State()
440
- with gr.Column(visible=True) as onboarding_message:
441
- gr.HTML("""
 
 
 
442
  <div style="
443
  background: transparent;
444
  border: 1px solid #e5e7eb;
@@ -476,45 +353,102 @@ def main(
476
  onmouseout="this.style.background='#3b82f6';"
477
  >View Tutorial</a>
478
  </div>
479
- """, padding=False)
 
 
480
  with gr.Column(visible=True) as upload_section:
481
- audio_input = gr.Audio(
482
  type="filepath",
483
- container=True,
484
- interactive=True,
485
- sources=['upload']
486
- )
487
  with gr.Group(visible=False) as chat:
488
- chatbot = gr.Chatbot(
489
- elem_id="chatbot",
490
- type="messages",
 
491
  render_markdown=False,
492
- feedback_options=["like", "dislike", "wrong species", "incorrect response", "other"],
493
- resizeable=True
 
 
 
 
 
 
494
  )
495
- chat_input = _multimodal_textbox_factory()
496
- send_all = gr.Button("Send all")
497
 
498
-
499
  def start_chat_interface(audio_path):
500
- return (
501
- gr.update(visible=False), # hide onboarding message
502
  gr.update(visible=True), # show upload section
503
- gr.update(visible=True), # show chat box
504
  )
505
 
506
  audio_input.change(
507
  fn=start_chat_interface,
508
  inputs=[audio_input],
509
- outputs=[onboarding_message, upload_section, chat]
510
  )
511
 
512
- chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
513
- send_all.click(bot_response, [chatbot], [chatbot])
 
 
 
514
 
 
 
 
515
 
516
  with gr.Tab("Sample Library"):
517
- gr.Markdown("## Sample Library\n\nExplore example audio files below.")
518
  gr.Examples(
519
  list(examples.values()),
520
  chatbot,
@@ -523,10 +457,10 @@ def main(
523
  examples_per_page=20,
524
  )
525
  with gr.Tab("💡 Help"):
526
- gr.Markdown("## User Guide") # to fill out
527
- gr.Markdown("## Share Feedback") # to fill out
528
- gr.Markdown("## FAQs") # to fill out
529
-
530
  app.css = """
531
  .welcome-banner {
532
  background: transparent !important;
@@ -550,7 +484,7 @@ def main(
550
  _batch_tab()
551
  with gr.Tab("Long Recording"):
552
  _long_recording_tab() """
553
-
554
  return app
555
 
556
 
@@ -559,8 +493,7 @@ app = main(
559
  assets_dir=Path("assets"),
560
  cfg_path=Path("configs/inference.yml"),
561
  options=[],
562
- device="cuda",
563
  )
564
 
565
  if __name__ == "__main__":
566
- app.launch()
 
1
+ import warnings
2
+ import numpy as np
 
3
  from pathlib import Path
4
+ from typing import Optional
5
+ from collections import Counter
6
 
7
  import gradio as gr
8
  import torch
9
+ import torchaudio
10
+ import matplotlib.pyplot as plt
11
 
12
  from NatureLM.config import Config
13
  from NatureLM.models.NatureLM import NatureLM
14
+ from NatureLM.infer import Pipeline
15
  import spaces
16
 
17
+ warnings.filterwarnings("ignore")
18
+ SAMPLE_RATE = 16000 # Default sample rate for NatureLM-audio
19
+
20
+
21
+ def get_spectrogram(audio: torch.Tensor) -> plt.Figure:
22
+ """Generate a spectrogram from the audio tensor."""
23
+ spectrogram = torchaudio.transforms.Spectrogram(n_fft=1024)(audio)
24
+ spectrogram = spectrogram.numpy()[0].squeeze()
25
+ # Convert to matplotlib figure with imshow
26
+ fig, ax = plt.subplots(figsize=(13, 5))
27
+ ax.imshow(np.log(spectrogram + 1e-3), aspect="auto", origin="lower", cmap="viridis")
28
+ ax.set_title("Spectrogram")
29
+ ax.set_xlabel("Time")
30
+ # Set x ticks to reflect 0 to audio duration seconds
31
+ if audio.dim() > 1:
32
+ duration = audio.size(1) / SAMPLE_RATE
33
+ else:
34
+ duration = audio.size(0) / SAMPLE_RATE
35
+
36
+ ax.set_xticks([0, spectrogram.shape[1]])
37
+ ax.set_xticklabels(["0s", f"{duration:.2f}s"])
38
+ ax.set_ylabel("Frequency")
39
+ # Set y ticks to reflect 0 to nyquist frequency (sample_rate/2)
40
+ nyquist_freq = SAMPLE_RATE / 2
41
+ ax.set_yticks(
42
+ [
43
+ 0,
44
+ spectrogram.shape[0] // 4,
45
+ spectrogram.shape[0] // 2,
46
+ 3 * spectrogram.shape[0] // 4,
47
+ spectrogram.shape[0] - 1,
48
+ ]
49
+ )
50
+ ax.set_yticklabels(
51
+ [
52
+ "0 Hz",
53
+ f"{nyquist_freq / 4:.0f} Hz",
54
+ f"{nyquist_freq / 2:.0f} Hz",
55
+ f"{3 * nyquist_freq / 4:.0f} Hz",
56
+ f"{nyquist_freq:.0f} Hz",
57
+ ]
58
+ )
59
+ fig.tight_layout()
60
+
61
+ return fig
62
+
63
 
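A minimal sketch of exercising the new `get_spectrogram` helper on one of the bundled clips (assumes torchaudio can decode the mp3; the resample step only matters when the source rate differs from SAMPLE_RATE):

```python
import torchaudio

# Sketch only: load a clip, bring it to the model sample rate, render the figure.
wav, sr = torchaudio.load("assets/Lazuli_Bunting_yell-YELLLAZB20160625SM303143.mp3")
if sr != SAMPLE_RATE:
    wav = torchaudio.functional.resample(wav, sr, SAMPLE_RATE)
fig = get_spectrogram(wav)
fig.savefig("spectrogram.png")
```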
64
  class ModelManager:
65
  """Manages model loading and state"""
66
+
67
  def __init__(self):
68
  self.model: Optional[NatureLM] = None
69
  self.config: Optional[Config] = None
70
  self.is_loaded = False
71
  self.is_loading = False
72
  self.load_failed = False
73
+
74
  def check_availability(self) -> tuple[bool, str]:
75
  """Check if the model is available for download"""
76
  try:
77
  from huggingface_hub import model_info
78
+
79
  info = model_info("EarthSpeciesProject/NatureLM-audio")
80
  return True, "Model is available"
81
  except Exception as e:
82
  return False, f"Model not available: {str(e)}"
83
+
84
  def reset_state(self):
85
  """Reset the model loading state to allow retrying after a failure"""
86
  self.model = None
 
88
  self.is_loading = False
89
  self.load_failed = False
90
  return self.get_status()
91
+
92
  def get_status(self) -> str:
93
  """Get the current model loading status"""
94
  if self.is_loaded:
 
99
  return "❌ Model failed to load. Please check the configuration."
100
  else:
101
  return "⏳ Ready to load model on first use"
102
+
103
  def load_model(self) -> Optional[NatureLM]:
104
  """Load the model if needed"""
105
  if self.is_loaded:
106
  return self.model
107
+
108
  if self.is_loading or self.load_failed:
109
  return None
110
+
111
  try:
112
  self.is_loading = True
113
  print("Loading model...")
114
+
115
  # Check if model is available first
116
  available, message = self.check_availability()
117
  if not available:
118
  raise Exception(f"Model not available: {message}")
119
+
120
  model = NatureLM.from_pretrained("EarthSpeciesProject/NatureLM-audio")
121
+ model.to("cpu")
122
  model.eval()
123
+
124
+ pipe = Pipeline(model)
125
+ self.model = pipe
126
  self.is_loaded = True
127
  self.is_loading = False
128
  print("Model loaded successfully!")
129
+ return pipe
130
+
131
  except Exception as e:
132
  print(f"Error loading model: {e}")
133
  self.is_loading = False
 
138
  # Global model manager instance
139
  model_manager = ModelManager()
140
 
141
+
142
+ def take_majority_vote(results: list[list[dict]]) -> list[str]:
143
+ """For each audio file, take the majority vote of the labels across all windows"""
144
+ outputs = []
145
+ for result in results:
146
+ predictions = [window["prediction"] for window in result]
147
+ if not predictions:
148
+ continue
149
+ # Count occurrences of each label
150
+ counts = Counter(predictions)
151
+ # Find the most common label
152
+ most_common_label, _ = counts.most_common(1)[0]
153
+ outputs.append(most_common_label)
154
+
155
+ return outputs
156
+
157
+
158
  @spaces.GPU
159
+ def prompt_lm(
160
+ audios: list[str],
161
+ queries: list[str] | str,
162
+ window_length_seconds: float = 10.0,
163
+ hop_length_seconds: float = 10.0,
164
+ progress=gr.Progress(),
165
+ ) -> list[str]:
166
+ """Generate response using the model
167
+
168
+ Args:
169
+ audios (list[str]): List of audio file paths
170
+ queries (list[str] | str): Query or list of queries to process
171
+ window_length_seconds (float): Length of the window for processing audio
172
+ hop_length_seconds (float): Hop length for processing audio
173
+
174
+ Returns:
175
+ list[str]: List of generated responses for each audio-query pair
176
+ """
177
  model = model_manager.load_model()
178
+
179
  if model is None:
180
  if model_manager.is_loading:
181
  return "🔄 Loading model... This may take a few minutes on first use. Please try again in a moment."
 
183
  return "❌ Model failed to load. This could be due to:\n• No internet connection\n• Insufficient disk space\n• Model repository access issues\n\nPlease check your connection and try again using the retry button."
184
  else:
185
  return "Demo mode: Model not loaded. Please check the model configuration."
186
+
187
+ results: list[list[dict]] = model(
188
+ audios,
189
+ queries,
190
+ window_length_seconds=window_length_seconds,
191
+ hop_length_seconds=hop_length_seconds,
192
+ input_sample_rate=None,
193
+ progress_bar=progress,
 
 
 
 
 
194
  )
195
+ return results
196
 
197
 
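A hedged sketch of calling `prompt_lm` directly and collapsing its windowed predictions with `take_majority_vote`; the asset path and query come from this app, but the per-window dict shape with a `prediction` key is assumed from the Pipeline output.

```python
# Sketch only: mirrors how the chat handler below consumes the results.
results = prompt_lm(
    audios=["assets/nri-GreenTreeFrogEvergladesNP.mp3"],
    queries=["What is the common name for the focal species in the audio?"],
    window_length_seconds=10.0,
    hop_length_seconds=10.0,
)
if isinstance(results, str):
    print(results)  # model still loading, failed to load, or demo mode
else:
    print(take_majority_vote(results))  # one majority label per input file
```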
198
  def user_message(content):
199
  return {"role": "user", "content": content}
200
 
201
 
202
+ def add_message_and_get_response(
203
+ chatbot_history: list[dict], audio_input: str, chat_input: str
204
+ ) -> tuple[list[dict], str]:
205
+ """Add user message to chat and get model response"""
206
+ # Load audio with torchaudio and compute spectrogram
207
+ audio_tensor, sample_rate = torchaudio.load(audio_input)
208
+ duration = audio_tensor.size(1) / sample_rate
 
 
 
 
 
209
 
210
+ spectrogram_fig = get_spectrogram(audio_tensor)
211
+ # Add gr.Plot to chatbot history
212
+ chatbot_history.append(
213
+ {"role": "user", "content": gr.Plot(spectrogram_fig, label="Spectrogram")}
 
 
 
 
 
 
 
 
 
 
 
 
214
  )
215
+ # Get response
216
+ try:
217
+ response = prompt_lm(
218
+ audios=[audio_input],
219
+ queries=[chat_input],
220
+ window_length_seconds=duration,
221
+ hop_length_seconds=duration,
 
 
 
 
 
 
 
222
  )
223
+ # get first item
224
+ if isinstance(response, list) and len(response) > 0:
225
+ response = response[0][0]["prediction"]
 
 
 
 
 
 
226
  else:
227
+ response = "No response generated."
228
+ except Exception as e:
229
+ print(f"Error generating response: {e}")
230
+ response = "Error generating response. Please try again."
231
+
232
+ # Add user message to chat history
233
+ chatbot_history.append({"role": "user", "content": "Q: " + chat_input})
234
+ # Add model response to chat history
235
+ chatbot_history.append({"role": "assistant", "content": response})
236
+ return chatbot_history, ""
 
 
 
 
 
 
237
 
238
 
239
  def main(
240
  assets_dir: Path,
241
  cfg_path: str | Path,
242
  options: list[str] = [],
 
243
  ):
244
  # Load configuration
245
  try:
 
255
  if not assets_dir.exists():
256
  print(f"Warning: Assets directory {assets_dir} does not exist")
257
  assets_dir.mkdir(exist_ok=True)
258
+
259
  # Create placeholder audio files if they don't exist
260
  laz_audio = assets_dir / "Lazuli_Bunting_yell-YELLLAZB20160625SM303143.mp3"
261
  frog_audio = assets_dir / "nri-GreenTreeFrogEvergladesNP.mp3"
 
272
  "Caption the audio (Green Tree Frog)": [
273
  [
274
  user_message({"path": str(frog_audio)}),
275
+ user_message(
276
+ "Caption the audio, using the common name for any animal species."
277
+ ),
278
  ]
279
  ],
280
  "Caption the audio (American Robin)": [
 
291
  ],
292
  }
293
 
294
+ with gr.Blocks(
295
+ title="NatureLM-audio",
296
+ theme=gr.themes.Base(
297
+ primary_hue="blue", font=[gr.themes.GoogleFont("Noto Sans")]
298
+ ),
299
+ ) as app:
300
  header = gr.HTML("""
301
  <div style="display: flex; align-items: center; gap: 12px;"><h2 style="margin: 0;">NatureLM-audio<span style="font-size: 0.55em; color: #28a745; background: #e6f4ea; padding: 2px 6px; border-radius: 4px; margin-left: 8px; display: inline-block; vertical-align: top;">BETA</span></h2></div>
302
 
303
  """)
304
+
305
  with gr.Tabs():
306
  with gr.Tab("Analyze Audio"):
307
+ uploaded_audio = gr.State()
308
+ # Status indicator
309
+ # status_text = gr.Textbox(
310
+ # value=model_manager.get_status(),
311
+ # label="Model Status",
312
+ # interactive=False,
313
+ # visible=True,
314
+ # )
315
+
316
+ with gr.Column(visible=True) as onboarding_message:
317
+ gr.HTML(
318
+ """
319
  <div style="
320
  background: transparent;
321
  border: 1px solid #e5e7eb;
 
353
  onmouseout="this.style.background='#3b82f6';"
354
  >View Tutorial</a>
355
  </div>
356
+ """,
357
+ padding=False,
358
+ )
359
  with gr.Column(visible=True) as upload_section:
360
+ audio_input = gr.Audio(
361
  type="filepath",
362
+ container=True,
363
+ interactive=True,
364
+ sources=["upload"],
365
+ )
366
  with gr.Group(visible=False) as chat:
367
+ chatbot = gr.Chatbot(
368
+ elem_id="chatbot",
369
+ type="messages",
370
+ label="Chat",
371
  render_markdown=False,
372
+ feedback_options=[
373
+ "like",
374
+ "dislike",
375
+ "wrong species",
376
+ "incorrect response",
377
+ "other",
378
+ ],
379
+ resizeable=True,
380
+ )
381
+ gr.Markdown("### Your Query")
382
+ task_dropdown = gr.Dropdown(
383
+ [
384
+ "What are the common names for the species in the audio, if any?",
385
+ "Caption the audio.",
386
+ "Caption the audio, using the scientific name for any animal species.",
387
+ "Caption the audio, using the common name for any animal species.",
388
+ "What is the scientific name for the focal species in the audio?",
389
+ "What is the common name for the focal species in the audio?",
390
+ "What is the family of the focal species in the audio?",
391
+ "What is the genus of the focal species in the audio?",
392
+ "What is the taxonomic name of the focal species in the audio?",
393
+ "What call types are heard from the focal species in the audio?",
394
+ "What is the life stage of the focal species in the audio?",
395
+ ],
396
+ label="Pre-configured Tasks",
397
+ allow_custom_value=True,
398
+ info="Select a task or enter a custom query below",
399
+ )
400
+ chat_input = gr.Textbox(
401
+ placeholder="e.g. 'Caption this audio'...",
402
+ type="text",
403
+ label="Query",
404
+ lines=2,
405
+ show_label=True,
406
+ container=False,
407
+ submit_btn="Send",
408
+ elem_id="chat-input",
409
+ )
410
+
411
+ # if task_dropdown is selected, set chat_input to that value
412
+ def set_query(task):
413
+ if task:
414
+ return gr.update(value=task)
415
+ return gr.update(value="")
416
+
417
+ task_dropdown.change(
418
+ fn=set_query,
419
+ inputs=[task_dropdown],
420
+ outputs=[chat_input],
421
+ )
422
+
423
+ clear_button = gr.ClearButton(
424
+ components=[chatbot, chat_input, audio_input], visible=False
425
  )
 
 
426
 
 
427
  def start_chat_interface(audio_path):
428
+ return (
429
+ gr.update(visible=False), # hide onboarding message
430
  gr.update(visible=True), # show upload section
431
+ gr.update(visible=True), # show chat box
432
  )
433
 
434
  audio_input.change(
435
  fn=start_chat_interface,
436
  inputs=[audio_input],
437
+ outputs=[onboarding_message, upload_section, chat],
438
  )
439
 
440
+ chat_input.submit(
441
+ add_message_and_get_response,
442
+ inputs=[chatbot, audio_input, chat_input],
443
+ outputs=[chatbot, chat_input],
444
+ ).then(lambda: gr.ClearButton(visible=True), None, [clear_button])
445
 
446
+ clear_button.click(
447
+ lambda: gr.ClearButton(visible=False), None, [clear_button]
448
+ )
449
 
450
  with gr.Tab("Sample Library"):
451
+ gr.Markdown("## Sample Library\n\nExplore example audio files below.")
452
  gr.Examples(
453
  list(examples.values()),
454
  chatbot,
 
457
  examples_per_page=20,
458
  )
459
  with gr.Tab("💡 Help"):
460
+ gr.Markdown("## User Guide") # to fill out
461
+ gr.Markdown("## Share Feedback") # to fill out
462
+ gr.Markdown("## FAQs") # to fill out
463
+
464
  app.css = """
465
  .welcome-banner {
466
  background: transparent !important;
 
484
  _batch_tab()
485
  with gr.Tab("Long Recording"):
486
  _long_recording_tab() """
487
+
488
  return app
489
 
490
 
 
493
  assets_dir=Path("assets"),
494
  cfg_path=Path("configs/inference.yml"),
495
  options=[],
 
496
  )
497
 
498
  if __name__ == "__main__":
499
+ app.launch()
requirements.txt CHANGED
@@ -1,31 +1,19 @@
1
- torch>=2.2.2
2
- torchaudio>=2.2.2
3
- torchvision>=0.17.2
4
- transformers[sentencepiece]>=4.44.2
5
- datasets>=2.20.0
6
- cloudpathlib[gs]>=0.20.0
7
- einops>=0.8.0
8
- gradio>=5.10.0
9
- google-cloud-aiplatform>=1.76.0
10
- Levenshtein>=0.25.1
11
- librosa>=0.9.2
12
- memoization>=0.4.0
13
- mir-eval>=0.7
14
- numpy>=1.26.4
15
- pandas>=1.4.3
16
- peft>=0.11.1
17
- plumbum>=1.7.2
18
- pydantic-settings>=2.7.1
19
- pydantic>=2.7.4
20
- pydub>=0.25.1
21
- pyyaml>=6.0
22
- resampy>=0.3.1
23
- scipy>=1.14.0
24
- soundfile>=0.12.1
25
- tensorboard>=2.18.0
26
- tensorboardX>=2.6.2.2
27
- spaces>=0.39.0
28
- tqdm>=4.66.4
29
- wandb>=0.17.3
30
- click>=8.1.7
31
- git+https://github.com/earthspecies/beans-zero.git
 
1
+ click>=8.2.1
2
+ einops>=0.8.1
3
+ gradio>=5.42.0
4
+ librosa>=0.11.0
5
+ pandas>=2.3.1
6
+ peft>=0.17.0
7
+ plumbum>=1.9.0
8
+ pydantic>=2.11.7
9
+ pydantic-settings>=2.10.1
10
+ pyyaml>=6.0.2
11
+ resampy>=0.4.3
12
+ scipy>=1.15.3
13
+ soundfile>=0.13.1
14
+ spaces>=0.40.0
15
+ torch>=2.8.0
16
+ torchaudio>=2.8.0
17
+ tqdm>=4.67.1
18
+ transformers[sentencepiece]>=4.55.2
19
+ matplotlib>=3.10.5