UnicodeMetadata
The `UnicodeMetadata` class provides utilities for embedding and extracting metadata using Unicode variation selectors. This class is the primary interface for embedding and verifying metadata with digital signatures.
Overview
Unicode variation selectors are special characters that modify the appearance of the preceding character. EncypherAI uses a specific range of these selectors (VS1-VS256) to encode binary data within text without changing its visible appearance.
The `UnicodeMetadata` class provides methods to:
- Convert binary data to and from variation selectors
- Find suitable targets in text for embedding metadata
- Embed and extract metadata at the character level
- Sign and verify metadata using Ed25519 digital signatures
Class Definition
class UnicodeMetadata:
    """
    Utilities for embedding and extracting metadata using Unicode variation selectors.

    Variation selectors live in two separate code-point ranges; the constants
    below give the inclusive bounds of each range.
    """

    # Unicode variation selector range (VS1-VS256), split across two blocks:
    # 16 code points in the first range plus 240 in the second = 256 in total.
    VARIATION_SELECTOR_START = 0xFE00  # VS1
    VARIATION_SELECTOR_END = 0xFE0F  # VS16 (end of first range)
    VARIATION_SELECTOR_START_2 = 0xE0100  # VS17
    VARIATION_SELECTOR_END_2 = 0xE01EF  # VS256 (end of second range)
Class Methods
to_variation_selector
@classmethod
def to_variation_selector(cls, byte: int) -> Optional[str]:
"""
Convert a byte value (0-255) to a Unicode variation selector.
Args:
byte: Integer value between 0-255
Returns:
Unicode variation selector character or None if the byte is out of range
"""
from_variation_selector
@classmethod
def from_variation_selector(cls, char: str) -> Optional[int]:
"""
Convert a Unicode variation selector to its byte value.
Args:
char: Unicode variation selector character
Returns:
Integer value between 0-255 or None if the character is not a variation selector
"""
is_variation_selector
@classmethod
def is_variation_selector(cls, char: str) -> bool:
"""
Check if a character is a Unicode variation selector.
Args:
char: Character to check
Returns:
True if the character is a variation selector, False otherwise
"""
find_targets
@classmethod
def find_targets(
cls,
text: str,
target: Union[str, MetadataTarget] = "whitespace"
) -> List[int]:
"""
Find suitable target positions in text for embedding metadata.
Args:
text: The text to analyze
target: Where to embed metadata. Can be a string ("whitespace", "punctuation",
"first_letter", "last_letter", "all_characters") or a MetadataTarget enum.
Returns:
List of character indices that are suitable for embedding metadata
"""
embed_bytes
@classmethod
def embed_bytes(
cls,
text: str,
data: bytes,
target: Union[str, MetadataTarget] = "whitespace"
) -> str:
"""
Embed binary data into text using Unicode variation selectors.
Args:
text: The text to embed data into
data: Binary data to embed
target: Where to embed data. Can be a string ("whitespace", "punctuation",
"first_letter", "last_letter", "all_characters") or a MetadataTarget enum.
Returns:
Text with embedded data
Raises:
ValueError: If there are not enough targets to embed all data
"""
extract_bytes
@classmethod
def extract_bytes(cls, text: str) -> bytes:
"""
Extract binary data embedded in text using Unicode variation selectors.
Args:
text: Text with embedded data
Returns:
Extracted binary data
"""
embed_metadata
@classmethod
def embed_metadata(
cls,
text: str,
metadata: Dict[str, Any],
private_key: PrivateKeyTypes,
target: str = "whitespace"
) -> str:
"""
Embed metadata into text using Unicode variation selectors.
Args:
text: The text to embed metadata into
metadata: Dictionary containing metadata to embed. Must include a 'key_id' field
that identifies the private key used for signing and a 'timestamp' field.
private_key: Ed25519 private key used to sign the metadata
target: Where to embed metadata. Can be "whitespace", "punctuation",
"first_letter", "last_letter", or "all_characters"
Returns:
Text with embedded metadata and digital signature
Raises:
ValueError: If there are not enough targets to embed all data, if metadata
doesn't contain a 'key_id' field, or if metadata doesn't contain a 'timestamp' field
"""
extract_metadata
@classmethod
def extract_metadata(cls, text: str) -> Optional[Dict[str, Any]]:
"""
Extract metadata embedded in text using Unicode variation selectors.
This method does NOT verify the digital signature.
Args:
text: Text with embedded metadata
Returns:
Extracted metadata dictionary or None if no metadata could be extracted
"""
verify_metadata
@classmethod
def verify_metadata(
cls,
text: str,
public_key_resolver: Callable[[str], Optional[PublicKeyTypes]]
) -> Tuple[bool, Optional[Dict[str, Any]]]:
"""
Verify the metadata embedded in text using digital signature verification.
Args:
text: Text with embedded metadata and digital signature
public_key_resolver: A function that takes a key_id and returns the corresponding
public key, or None if the key_id is not recognized
Returns:
Tuple containing:
- A boolean indicating whether the verification was successful
- The verified metadata if successful, otherwise None
Notes:
The verification process:
1. Extracts the metadata and signature from the text
2. Gets the key_id from the metadata
3. Resolves the public key using the provided resolver function
4. Verifies the signature using the public key
5. Returns (True, metadata) if verification succeeds, (False, None) otherwise
"""
MetadataTarget Enum
The `MetadataTarget` enum defines the possible targets for embedding metadata:
class MetadataTarget(Enum):
    """Enum for specifying where to embed metadata in text"""

    # Each member's value is the string form accepted wherever a target
    # can be given as a plain string.
    WHITESPACE = "whitespace"  # Default - embed in whitespace
    PUNCTUATION = "punctuation"  # Embed in punctuation marks
    FIRST_LETTER = "first_letter"  # Embed in first letter of each word
    LAST_LETTER = "last_letter"  # Embed in last letter of each word
    ALL_CHARACTERS = "all_characters"  # Embed in all characters (not recommended)
    NONE = "none"  # Don't embed metadata (for testing/debugging)
Usage Example
import time

from encypher.core.keys import generate_key_pair
from encypher.core.unicode_metadata import UnicodeMetadata

# Sample text
text = "This is a sample text for embedding metadata."

# Find suitable targets for embedding. Targets can be passed as plain strings
# ("whitespace", "punctuation", ...) or as MetadataTarget enum members.
whitespace_targets = UnicodeMetadata.find_targets(text, "whitespace")
punctuation_targets = UnicodeMetadata.find_targets(text, "punctuation")
print(f"Whitespace targets: {len(whitespace_targets)} positions")
print(f"Punctuation targets: {len(punctuation_targets)} positions")

# Create a key pair for signing and verification. embed_metadata() needs a
# real private key object, and the resolver must return a real public key
# object; plain placeholder strings will not work.
private_key, public_key = generate_key_pair()
key_id = "your-key-id"

# Embed metadata
metadata = {
    "model_id": "gpt-4",
    "timestamp": int(time.time()),  # Unix/Epoch timestamp
    "version": "1.0.0",
    "key_id": key_id,  # Required for verification
}
encoded_text = UnicodeMetadata.embed_metadata(
    text=text,
    metadata=metadata,
    private_key=private_key,
)
print("\nOriginal text:")
print(text)
print("\nEncoded text (looks identical but contains embedded data):")
print(encoded_text)

# Extract the metadata (this does NOT verify the signature)
extracted_metadata = UnicodeMetadata.extract_metadata(encoded_text)
print(f"\nExtracted metadata: {extracted_metadata}")

# Verify the metadata. The resolver maps a key_id to its public key and
# returns None for unknown key ids.
is_valid, verified_metadata = UnicodeMetadata.verify_metadata(
    text=encoded_text,
    public_key_resolver=lambda requested_id: public_key if requested_id == key_id else None,
)
print(f"\nVerification result: {'✅ Verified' if is_valid else '❌ Failed'}")
print(f"Verified metadata: {verified_metadata}")

# Demonstrate variation selector conversion
byte_value = 65  # ASCII 'A'
vs_char = UnicodeMetadata.to_variation_selector(byte_value)
# to_variation_selector() returns None for out-of-range input, so guard
# before calling ord() on the result.
if vs_char is not None:
    print(f"\nByte value {byte_value} converted to variation selector: U+{ord(vs_char):04X}")
    back_to_byte = UnicodeMetadata.from_variation_selector(vs_char)
    print(f"Variation selector converted back to byte: {back_to_byte}")
Advanced: Key Management
The `UnicodeMetadata` class requires proper key management for secure operation:
import time
from typing import Dict, Optional

from cryptography.hazmat.primitives.asymmetric.types import PublicKeyTypes
from encypher.core.keys import generate_key_pair
from encypher.core.unicode_metadata import UnicodeMetadata

# Generate key pair
private_key, public_key = generate_key_pair()
key_id = "example-key-1"

# Store public key (in a real application, this would be a database or secure storage)
public_keys_store: Dict[str, PublicKeyTypes] = {key_id: public_key}


# Create a resolver function
def resolve_public_key(key_id: str) -> Optional[PublicKeyTypes]:
    """Return the public key registered under key_id, or None if it is unknown."""
    return public_keys_store.get(key_id)


# Create metadata with key_id
metadata = {
    "model_id": "gpt-4",
    "timestamp": int(time.time()),
    "version": "2.0.0",
    "key_id": key_id,  # Required for verification
}

# Embed metadata with digital signature
encoded_text = UnicodeMetadata.embed_metadata(
    text="This is a sample text.",
    metadata=metadata,
    private_key=private_key,
)

# Later, verify the metadata
is_valid, verified_metadata = UnicodeMetadata.verify_metadata(
    text=encoded_text,
    public_key_resolver=resolve_public_key,
)
print(f"Verification successful: {is_valid}")
print(f"Verified metadata: {verified_metadata}")
Advanced: Handling Timestamps
The `UnicodeMetadata` class works with timestamps as part of the metadata:
import time

from encypher.core.keys import generate_key_pair
from encypher.core.unicode_metadata import UnicodeMetadata

# Sample text
text = "This is a sample text for embedding metadata with timestamps."

# Create a key pair; embed_metadata() needs a real private key object
private_key, public_key = generate_key_pair()

# Create metadata with a timestamp
metadata = {
    "model_id": "gpt-4",
    "timestamp": int(time.time()),  # Unix/Epoch timestamp
    "version": "1.0.0",
    "key_id": "your-key-id",  # Required for verification
}

# Embed metadata. embed_metadata() returns the encoded text only (a str),
# and raises ValueError on failure, so there is no success flag to unpack.
encoded_text = UnicodeMetadata.embed_metadata(
    text=text,
    metadata=metadata,
    private_key=private_key,
)
print(f"Encoded text: {encoded_text}")

# Extract metadata. extract_metadata() returns the metadata dictionary, or
# None if nothing could be extracted; it does NOT verify the signature.
extracted_metadata = UnicodeMetadata.extract_metadata(encoded_text)
print(f"Extraction successful: {extracted_metadata is not None}")
print(f"Extracted metadata: {extracted_metadata}")
if extracted_metadata is not None:
    print(f"Extracted timestamp: {extracted_metadata.get('timestamp')}")
Related Classes
- `MetadataEncoder`: Higher-level interface for embedding and extracting metadata
- `StreamingMetadataEncoder`: For handling streaming content