---
# KServe InferenceService "nim-nv-rerankqa-llama-l" in the "default" namespace.
# Declares a single-GPU predictor using the "nvidia-nim-nv-rerankqa-llama"
# model format and runtime.
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  annotations:
    # Metrics scraping settings: enabled, port 8000, path /v1/metrics.
    # Annotation values must be strings, hence the quotes.
    serving.kserve.io/enable-prometheus-scraping: "true"
    prometheus.kserve.io/port: "8000"
    prometheus.kserve.io/path: "/v1/metrics"
  name: nim-nv-rerankqa-llama-l
  namespace: default
spec:
  predictor:
    # Tolerate any NoSchedule taint keyed "nvidia.com/gpu", regardless of its
    # value (operator: Exists).
    tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
    # Restrict scheduling to nodes labelled nvidia.com/gpu="true".
    nodeSelector:
      nvidia.com/gpu: "true"
    # NOTE(review): presumably a registry-credentials Secret named "edb-cred"
    # that must already exist in this namespace - confirm.
    imagePullSecrets:
      - name: edb-cred
    model:
      modelFormat:
        name: nvidia-nim-nv-rerankqa-llama
      # Request and limit are both exactly one GPU.
      resources:
        limits:
          nvidia.com/gpu: "1"
        requests:
          nvidia.com/gpu: "1"
      # NOTE(review): assumes a ServingRuntime/ClusterServingRuntime with this
      # name is installed in the cluster - it is not defined in this file.
      runtime: nvidia-nim-nv-rerankqa-llama