KServe Custom Resource


ServingRuntimes

python3 -m pip install kserve

If you need to use Storage, install kserve[storage] instead.
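For example, quoting the extra so the shell does not expand the brackets:

python3 -m pip install "kserve[storage]"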

<project>
├── app/
│   ├── __init__.py
│   ├── runtime/
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   └── predictor.py
│   └── ...
└── ...
app/runtime/__main__.py
import argparse

from kserve import ModelServer, model_server
# from kserve.storage import Storage

from app.runtime.predictor import DEFAULT_MODEL_NAME, Predictor

parser = argparse.ArgumentParser(parents=[model_server.parser])
parser.add_argument("--model_dir", required=True, help="A URI pointer to the model binary")
parser.add_argument(
    "--model_name", help="The name that the model is served under.", default=DEFAULT_MODEL_NAME
)
args, _ = parser.parse_known_args()

ModelServer().start([Predictor(args.model_name, args.model_dir)])
# ModelServer().start([Predictor(args.model_name, Storage.download(args.model_dir))])
app/runtime/predictor.py
from kserve import InferRequest, InferResponse, Model

DEFAULT_MODEL_NAME = "model"
DEFAULT_LOCAL_MODEL_DIR = "/tmp/model"


class Predictor(Model):
    def __init__(self, name: str = DEFAULT_MODEL_NAME, model_dir: str = DEFAULT_LOCAL_MODEL_DIR):
        super().__init__(name)

        self._model_dir = model_dir

        self.load()

    def load(self):
        # Load model artifacts from self._model_dir here; this stub only marks the model ready.
        self.ready = True

    async def preprocess(
        self, payload: InferRequest, headers: dict[str, str] | None = None
    ) -> InferRequest:
        return payload

    async def predict(
        self, payload: InferRequest, headers: dict[str, str] | None = None
    ) -> InferResponse:
        return InferResponse(
            response_id=payload.id,
            model_name=self.name,
            infer_outputs=[],
        )

    def postprocess(
        self, response: InferResponse, headers: dict[str, str] | None = None
    ) -> InferResponse:
        return response
python3 -m app.runtime --model_dir=<path>
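
Once the server is up on the default HTTP port 8080, a quick way to exercise it is an Open Inference Protocol (v2) request. The endpoint below assumes the default model name "model"; the input tensor is only an illustration, and this stub predictor answers with an empty outputs list:

curl -X POST http://localhost:8080/v2/models/model/infer \
    -H "Content-Type: application/json" \
    -d '{"inputs": [{"name": "input-0", "shape": [1], "datatype": "FP32", "data": [1.0]}]}'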

KServe Args


  • built-in flags
    • --http_port=8080
    • --grpc_port=8081
    • --workers=1: number of REST API server processes
    • --max_threads=4: number of gRPC processing threads
    • --max_asyncio_workers=<threads>: event-loop worker threads; default: min(32, CPU count + 4)
    • --enable_grpc=true
    • --enable_docs_url=false
    • --enable_latency_logging=true
    • --configure_logging=true
    • --log_config_file=<path>
    • --access_log_format=<format>
  • added by the runtime's entrypoint above (see the example invocation after this list)
    • --model_dir=<path>
    • --model_name=<name>
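
Putting the flags together, a local run of the runtime above might look like this (the model name and path are placeholders):

python3 -m app.runtime \
    --model_name=my-model \
    --model_dir=/mnt/models \
    --http_port=8080 \
    --grpc_port=8081 \
    --workers=1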

Transformer

A Transformer is implemented the same way as the ServingRuntimes above, just without predict; the inherited Model.predict forwards the pre-processed request to the predictor instead. A possible entrypoint is sketched after the code below.

from kserve import InferRequest, InferResponse, Model

DEFAULT_MODEL_NAME = "model"


class Transformer(Model):
    def __init__(self, name: str = DEFAULT_MODEL_NAME):
        super().__init__(name)

        # No local artifacts to load, so mark the model ready immediately.
        self.ready = True

    # predict is intentionally not overridden: the inherited Model.predict
    # forwards the pre-processed request to the predictor service.

    async def preprocess(
        self, payload: InferRequest, headers: dict[str, str] | None = None
    ) -> InferRequest:
        return payload

    def postprocess(
        self, response: InferResponse, headers: dict[str, str] | None = None
    ) -> InferResponse:
        return response
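
For completeness, here is a minimal sketch of an entrypoint for this Transformer, modeled on KServe's custom transformer examples. The module path app/transformer/, the --predictor_host flag, and attaching predictor_host to the model instance are assumptions; recent kserve releases already define --predictor_host on model_server.parser, in which case the extra add_argument below should be dropped.

app/transformer/__main__.py
import argparse

from kserve import ModelServer, model_server

from app.transformer.model import DEFAULT_MODEL_NAME, Transformer  # hypothetical module path

parser = argparse.ArgumentParser(parents=[model_server.parser])
# On older kserve releases --predictor_host is not part of model_server.parser; add it here.
parser.add_argument(
    "--predictor_host", required=True, help="host:port of the predictor to forward requests to"
)
parser.add_argument(
    "--model_name", help="The name that the model is served under.", default=DEFAULT_MODEL_NAME
)
args, _ = parser.parse_known_args()

transformer = Transformer(args.model_name)
# Point the inherited Model.predict at the predictor service.
transformer.predictor_host = args.predictor_host

ModelServer().start([transformer])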