280 lines
8.6 KiB
Python
280 lines
8.6 KiB
Python
from __future__ import annotations
|
|
|
|
import os
|
|
from ctypes import (
|
|
c_bool,
|
|
c_char_p,
|
|
c_int,
|
|
c_uint8,
|
|
c_uint32,
|
|
c_float,
|
|
c_void_p,
|
|
c_size_t,
|
|
POINTER,
|
|
_Pointer, # type: ignore
|
|
Structure,
|
|
byref,
|
|
)
|
|
import pathlib
|
|
from typing import (
|
|
Union,
|
|
NewType,
|
|
Optional,
|
|
TYPE_CHECKING,
|
|
)
|
|
|
|
import llama_cpp.llama_cpp as llama_cpp
|
|
|
|
from llama_cpp._ctypes_extensions import (
|
|
load_shared_library,
|
|
ctypes_function_for_shared_library,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from llama_cpp._ctypes_extensions import (
|
|
CtypesArray,
|
|
)
|
|
|
|
|
|
# Base name of the shared library to load.
_libmtmd_base_name = "mtmd"

# Optional user-supplied location of the library, taken from the environment.
_libmtmd_override_path = os.environ.get("MTMD_CPP_LIB")

if _libmtmd_override_path is None:
    # Default: the "lib" directory located next to this module.
    _libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
else:
    # Honour the override. The value used to be read and then discarded, so the
    # current working directory was searched instead of the requested location.
    _libmtmd_override = pathlib.Path(_libmtmd_override_path)
    # NOTE(review): it is not documented here whether MTMD_CPP_LIB names the
    # library file or its directory, so both are accepted: a file contributes
    # its parent directory, anything else is used as the directory itself.
    _libmtmd_base_path = (
        _libmtmd_override.parent
        if _libmtmd_override.is_file()
        else _libmtmd_override
    )
|
|
|
|
# Load the mtmd shared library from the base name and directory chosen above.
_libmtmd = load_shared_library(
    _libmtmd_base_name,
    _libmtmd_base_path,
)

# Decorator factory tied to the loaded library. The bindings in this module
# use it to attach a C symbol name, argument types and a result type to each
# Python stub.
ctypes_function = ctypes_function_for_shared_library(_libmtmd)
|
|
|
|
################################################
|
|
# mtmd.h types
|
|
################################################
|
|
|
|
# Opaque handle types.
#
# Every handle comes as a pair: a NewType over ``int`` that is used in Python
# annotations, and a ``*_ctypes`` alias of ``c_void_p`` that is used when the
# C signatures are declared.

# mtmd_context *
mtmd_context_p = NewType("mtmd_context_p", int)
mtmd_context_p_ctypes = c_void_p

# mtmd_bitmap *
mtmd_bitmap_p = NewType("mtmd_bitmap_p", int)
mtmd_bitmap_p_ctypes = c_void_p

# Presumably mtmd_image_tokens * -- no binding in this part of the file uses it.
mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int)
mtmd_image_tokens_p_ctypes = c_void_p

# mtmd_input_chunk *
mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int)
mtmd_input_chunk_p_ctypes = c_void_p

# mtmd_input_chunks *
mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int)
mtmd_input_chunks_p_ctypes = c_void_p
|
|
|
|
# Enum constants.
#
# Integer tags for the kinds of input chunk. They are presumably the values of
# the C ``enum mtmd_input_chunk_type`` that ``mtmd_input_chunk_get_type`` is
# declared to return -- confirm against mtmd.h.
MTMD_INPUT_CHUNK_TYPE_TEXT = 0   # text chunk
MTMD_INPUT_CHUNK_TYPE_IMAGE = 1  # image chunk
MTMD_INPUT_CHUNK_TYPE_AUDIO = 2  # audio chunk
|
|
|
|
# Structures
|
|
class mtmd_context_params(Structure):
    """ctypes layout for the C ``struct mtmd_context_params``.

    The structure is passed by value to ``mtmd_init_from_file`` and returned
    by value from ``mtmd_context_params_default``. Field order and types must
    stay in sync with the C header.
    """

    _fields_ = [
        # Two boolean switches.
        ("use_gpu", c_bool),
        ("print_timings", c_bool),
        # Integer settings.
        ("n_threads", c_int),
        ("verbosity", c_int),  # ggml_log_level
        # C strings (``char *``).
        ("image_marker", c_char_p),
        ("media_marker", c_char_p),
    ]
|
|
|
|
class mtmd_input_text(Structure):
    """ctypes layout for the C ``mtmd_input_text`` structure.

    ``mtmd_tokenize`` takes a pointer to an instance of this structure as its
    ``text`` argument. Field order and types must stay in sync with the C
    header.
    """

    _fields_ = [
        # C string (``char *``) holding the text.
        ("text", c_char_p),
        # Two boolean flags.
        ("add_special", c_bool),
        ("parse_special", c_bool),
    ]
|
|
|
|
################################################
|
|
# mtmd.h functions
|
|
################################################
|
|
|
|
# MTMD_API const char * mtmd_default_marker(void);
|
|
@ctypes_function(
    "mtmd_default_marker",
    [],
    c_char_p,
)
def mtmd_default_marker() -> bytes:
    """Stub bound to the C symbol ``mtmd_default_marker``.

    C prototype: ``const char * mtmd_default_marker(void)``. Declared with an
    empty argument list and a ``c_char_p`` result type.
    """
    ...
|
|
|
|
# MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
|
|
@ctypes_function(
    "mtmd_context_params_default",
    [],
    mtmd_context_params,
)
def mtmd_context_params_default() -> mtmd_context_params:
    """Stub bound to the C symbol ``mtmd_context_params_default``.

    C prototype:
    ``struct mtmd_context_params mtmd_context_params_default(void)``.
    Declared with no arguments; the result is a ``mtmd_context_params``
    structure returned by value.
    """
    ...
|
|
|
|
# MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
|
# const struct llama_model * text_model,
|
|
# const struct mtmd_context_params ctx_params);
|
|
@ctypes_function(
    "mtmd_init_from_file",
    [
        c_char_p,
        llama_cpp.llama_model_p_ctypes,
        mtmd_context_params,
    ],
    mtmd_context_p_ctypes,
)
def mtmd_init_from_file(
    mmproj_fname: bytes,
    text_model: llama_cpp.llama_model_p,
    ctx_params: mtmd_context_params,
    /,
) -> Optional[mtmd_context_p]:
    """Stub bound to the C symbol ``mtmd_init_from_file``.

    C prototype::

        mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
                                           const struct llama_model * text_model,
                                           const struct mtmd_context_params ctx_params);

    Args:
        mmproj_fname: ``const char *`` file name, passed as bytes.
        text_model: ``const struct llama_model *`` handle.
        ctx_params: ``mtmd_context_params`` structure, passed by value.

    Returns:
        An opaque ``mtmd_context *`` handle. The annotation allows ``None``.
    """
    ...
|
|
|
|
# MTMD_API void mtmd_free(mtmd_context * ctx);
|
|
@ctypes_function(
    "mtmd_free",
    [mtmd_context_p_ctypes],
    None,
)
def mtmd_free(ctx: mtmd_context_p, /):
    """Stub bound to the C symbol ``mtmd_free``.

    C prototype: ``void mtmd_free(mtmd_context * ctx)``.

    Args:
        ctx: Opaque ``mtmd_context *`` handle.
    """
    ...
|
|
|
|
# MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
|
|
@ctypes_function(
    "mtmd_support_vision",
    [mtmd_context_p_ctypes],
    c_bool,
)
def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool:
    """Stub bound to the C symbol ``mtmd_support_vision``.

    C prototype: ``bool mtmd_support_vision(mtmd_context * ctx)``.

    Args:
        ctx: Opaque ``mtmd_context *`` handle.

    Returns:
        The C ``bool`` result.
    """
    ...
|
|
|
|
# MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data);
|
|
@ctypes_function(
    "mtmd_bitmap_init",
    [
        c_uint32,
        c_uint32,
        POINTER(c_uint8),
    ],
    mtmd_bitmap_p_ctypes,
)
def mtmd_bitmap_init(
    nx: Union[c_uint32, int],
    ny: Union[c_uint32, int],
    data: CtypesArray[c_uint8],
    /,
) -> Optional[mtmd_bitmap_p]:
    """Stub bound to the C symbol ``mtmd_bitmap_init``.

    C prototype::

        mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny,
                                       const unsigned char * data);

    Args:
        nx: ``uint32_t`` value.
        ny: ``uint32_t`` value.
        data: ``const unsigned char *`` buffer, given as a ctypes array of
            ``c_uint8``.

    Returns:
        An opaque ``mtmd_bitmap *`` handle. The annotation allows ``None``.
    """
    ...
|
|
|
|
# MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap);
|
|
@ctypes_function(
    "mtmd_bitmap_free",
    [mtmd_bitmap_p_ctypes],
    None,
)
def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /):
    """Stub bound to the C symbol ``mtmd_bitmap_free``.

    C prototype: ``void mtmd_bitmap_free(mtmd_bitmap * bitmap)``.

    Args:
        bitmap: Opaque ``mtmd_bitmap *`` handle.
    """
    ...
|
|
|
|
# MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
|
|
@ctypes_function(
    "mtmd_input_chunks_init",
    [],
    mtmd_input_chunks_p_ctypes,
)
def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]:
    """Stub bound to the C symbol ``mtmd_input_chunks_init``.

    C prototype: ``mtmd_input_chunks * mtmd_input_chunks_init(void)``.

    Returns:
        An opaque ``mtmd_input_chunks *`` handle. The annotation allows
        ``None``.
    """
    ...
|
|
|
|
# MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
|
|
@ctypes_function(
    "mtmd_input_chunks_free",
    [mtmd_input_chunks_p_ctypes],
    None,
)
def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /):
    """Stub bound to the C symbol ``mtmd_input_chunks_free``.

    C prototype: ``void mtmd_input_chunks_free(mtmd_input_chunks * chunks)``.

    Args:
        chunks: Opaque ``mtmd_input_chunks *`` handle.
    """
    ...
|
|
|
|
# MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
|
|
@ctypes_function(
    "mtmd_input_chunks_size",
    [mtmd_input_chunks_p_ctypes],
    c_size_t,
)
def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int:
    """Stub bound to the C symbol ``mtmd_input_chunks_size``.

    C prototype:
    ``size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks)``.

    Args:
        chunks: Opaque ``mtmd_input_chunks *`` handle.

    Returns:
        The ``size_t`` result as a Python int.
    """
    ...
|
|
|
|
# MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx);
|
|
@ctypes_function(
    "mtmd_input_chunks_get",
    [
        mtmd_input_chunks_p_ctypes,
        c_size_t,
    ],
    mtmd_input_chunk_p_ctypes,
)
def mtmd_input_chunks_get(
    chunks: mtmd_input_chunks_p,
    idx: Union[c_size_t, int],
    /,
) -> Optional[mtmd_input_chunk_p]:
    """Stub bound to the C symbol ``mtmd_input_chunks_get``.

    C prototype::

        const mtmd_input_chunk * mtmd_input_chunks_get(
            const mtmd_input_chunks * chunks, size_t idx);

    Args:
        chunks: Opaque ``mtmd_input_chunks *`` handle.
        idx: ``size_t`` index.

    Returns:
        An opaque ``mtmd_input_chunk *`` handle. The annotation allows
        ``None``.
    """
    ...
|
|
|
|
# MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
# mtmd_input_chunks * output,
|
|
# const mtmd_input_text * text,
|
|
# const mtmd_bitmap ** bitmaps,
|
|
# size_t n_bitmaps);
|
|
@ctypes_function(
    "mtmd_tokenize",
    [
        mtmd_context_p_ctypes,          # mtmd_context * ctx
        mtmd_input_chunks_p_ctypes,     # mtmd_input_chunks * output
        POINTER(mtmd_input_text),       # const mtmd_input_text * text
        POINTER(mtmd_bitmap_p_ctypes),  # const mtmd_bitmap ** bitmaps
        c_size_t,                       # size_t n_bitmaps
    ],
    c_int,
)
def mtmd_tokenize(
    ctx: mtmd_context_p,
    output: mtmd_input_chunks_p,
    text: "_Pointer[mtmd_input_text]",
    bitmaps: CtypesArray[mtmd_bitmap_p_ctypes],
    n_bitmaps: Union[c_size_t, int],
    /,
) -> int:
    """Stub bound to the C symbol ``mtmd_tokenize``.

    C prototype::

        int32_t mtmd_tokenize(mtmd_context * ctx,
                              mtmd_input_chunks * output,
                              const mtmd_input_text * text,
                              const mtmd_bitmap ** bitmaps,
                              size_t n_bitmaps);

    Args:
        ctx: Opaque ``mtmd_context *`` handle.
        output: Opaque ``mtmd_input_chunks *`` handle.
        text: Pointer to a ``mtmd_input_text`` structure.
        bitmaps: ctypes array of ``mtmd_bitmap *`` handles.
        n_bitmaps: ``size_t`` count accompanying ``bitmaps``.

    Returns:
        The ``int32_t`` result of the C call, declared here as ``c_int``.
    """
    ...
|
|
|
|
# MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk);
|
|
@ctypes_function(
    "mtmd_input_chunk_get_n_tokens",
    [mtmd_input_chunk_p_ctypes],
    c_size_t,
)
def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int:
    """Stub bound to the C symbol ``mtmd_input_chunk_get_n_tokens``.

    C prototype:
    ``size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk)``.

    Args:
        chunk: Opaque ``mtmd_input_chunk *`` handle.

    Returns:
        The ``size_t`` result as a Python int.
    """
    ...
|
|
|
|
# MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk);
|
|
@ctypes_function(
    "mtmd_input_chunk_get_type",
    [mtmd_input_chunk_p_ctypes],
    c_int,
)
def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int:
    """Stub bound to the C symbol ``mtmd_input_chunk_get_type``.

    C prototype::

        enum mtmd_input_chunk_type mtmd_input_chunk_get_type(
            const mtmd_input_chunk * chunk);

    Args:
        chunk: Opaque ``mtmd_input_chunk *`` handle.

    Returns:
        The enum value as an int, declared here as ``c_int``.
    """
    ...
|
|
|
|
# MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output);
|
|
@ctypes_function(
    "mtmd_input_chunk_get_tokens_text",
    [
        mtmd_input_chunk_p_ctypes,
        POINTER(c_size_t),
    ],
    POINTER(llama_cpp.llama_token),
)
def mtmd_input_chunk_get_tokens_text(
    chunk: mtmd_input_chunk_p,
    n_tokens_output: "_Pointer[c_size_t]",
    /,
) -> Optional["_Pointer[llama_cpp.llama_token]"]:
    """Stub bound to the C symbol ``mtmd_input_chunk_get_tokens_text``.

    C prototype::

        const llama_token * mtmd_input_chunk_get_tokens_text(
            const mtmd_input_chunk * chunk, size_t * n_tokens_output);

    Args:
        chunk: Opaque ``mtmd_input_chunk *`` handle.
        n_tokens_output: Pointer to a ``size_t``. By its name this is an
            output parameter receiving a token count -- confirm against
            mtmd.h.

    Returns:
        A pointer to ``llama_token``. The annotation allows ``None``.
    """
    ...
|
|
|
|
################################################
|
|
# mtmd-helper.h functions
|
|
################################################
|
|
|
|
# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
|
|
@ctypes_function(
    "mtmd_helper_bitmap_init_from_buf",
    [
        mtmd_context_p_ctypes,
        POINTER(c_uint8),
        c_size_t,
    ],
    mtmd_bitmap_p_ctypes,
)
def mtmd_helper_bitmap_init_from_buf(
    ctx: mtmd_context_p,
    buf: CtypesArray[c_uint8],
    length: Union[c_size_t, int],
    /,
) -> Optional[mtmd_bitmap_p]:
    """Stub bound to the C symbol ``mtmd_helper_bitmap_init_from_buf``.

    C prototype::

        mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(
            mtmd_context * ctx, const unsigned char * buf, size_t len);

    Args:
        ctx: Opaque ``mtmd_context *`` handle.
        buf: ``const unsigned char *`` buffer, given as a ctypes array of
            ``c_uint8``.
        length: ``size_t`` value, named ``len`` in the C prototype.

    Returns:
        An opaque ``mtmd_bitmap *`` handle. The annotation allows ``None``.
    """
    ...
|
|
|
|
# MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
|
|
@ctypes_function(
    "mtmd_helper_get_n_tokens",
    [mtmd_input_chunks_p_ctypes],
    c_size_t,
)
def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int:
    """Stub bound to the C symbol ``mtmd_helper_get_n_tokens``.

    C prototype:
    ``size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks)``.

    Args:
        chunks: Opaque ``mtmd_input_chunks *`` handle.

    Returns:
        The ``size_t`` result as a Python int.
    """
    ...
|
|
|
|
# MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
|
# struct llama_context * lctx,
|
|
# const mtmd_input_chunk * chunk,
|
|
# llama_pos n_past,
|
|
# llama_seq_id seq_id,
|
|
# int32_t n_batch,
|
|
# bool logits_last,
|
|
# llama_pos * new_n_past);
|
|
@ctypes_function(
    "mtmd_helper_eval_chunk_single",
    [
        mtmd_context_p_ctypes,             # mtmd_context * ctx
        llama_cpp.llama_context_p_ctypes,  # struct llama_context * lctx
        mtmd_input_chunk_p_ctypes,         # const mtmd_input_chunk * chunk
        llama_cpp.llama_pos,               # llama_pos n_past
        llama_cpp.llama_seq_id,            # llama_seq_id seq_id
        c_int,                             # int32_t n_batch
        c_bool,                            # bool logits_last
        POINTER(llama_cpp.llama_pos),      # llama_pos * new_n_past
    ],
    c_int,
)
def mtmd_helper_eval_chunk_single(
    ctx: mtmd_context_p,
    lctx: llama_cpp.llama_context_p,
    chunk: mtmd_input_chunk_p,
    n_past: llama_cpp.llama_pos,
    seq_id: llama_cpp.llama_seq_id,
    n_batch: Union[c_int, int],
    logits_last: Union[c_bool, bool],
    new_n_past: "_Pointer[llama_cpp.llama_pos]",
    /,
) -> int:
    """Stub bound to the C symbol ``mtmd_helper_eval_chunk_single``.

    C prototype::

        int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
                                              struct llama_context * lctx,
                                              const mtmd_input_chunk * chunk,
                                              llama_pos n_past,
                                              llama_seq_id seq_id,
                                              int32_t n_batch,
                                              bool logits_last,
                                              llama_pos * new_n_past);

    Args:
        ctx: Opaque ``mtmd_context *`` handle.
        lctx: ``struct llama_context *`` handle.
        chunk: Opaque ``mtmd_input_chunk *`` handle.
        n_past: ``llama_pos`` value.
        seq_id: ``llama_seq_id`` value.
        n_batch: ``int32_t`` value.
        logits_last: C ``bool`` flag.
        new_n_past: Pointer to a ``llama_pos``. By its name this is an output
            parameter -- confirm against mtmd-helper.h.

    Returns:
        The ``int32_t`` result of the C call, declared here as ``c_int``.
    """
    ...
|