diff --git a/cli/Sources/Noora/Utilities/Terminal.swift b/cli/Sources/Noora/Utilities/Terminal.swift
index b8937164..2c57ccb7 100644
--- a/cli/Sources/Noora/Utilities/Terminal.swift
+++ b/cli/Sources/Noora/Utilities/Terminal.swift
@@ -143,10 +143,11 @@ public struct Terminal: Terminaling {
     }
 
     public func readCharacter() -> Character? {
-        if let char = readRawCharacter() {
-            return Character(UnicodeScalar(UInt8(char)))
+        let reader = UTF8Reader { // Byte source: one readRawCharacter() call per requested byte.
+            guard let rawChar = readRawCharacter() else { return nil }
+            return UInt8(truncatingIfNeeded: rawChar) // Low 8 bits only; never traps on out-of-range values.
         }
-        return nil
+        return reader.readCharacter() // nil when no byte was read or the bytes were not decodable.
     }
 
     /// Returns the size of the terminal if available.
@@ -269,3 +270,49 @@ public struct Terminal: Terminaling {
         }
     }
 }
+
+/// A reader that decodes UTF-8 encoded bytes into characters, pulling bytes one at a time from a closure.
+struct UTF8Reader {
+    private let readByte: () -> UInt8? // Returns the next byte, or nil when none is available.
+
+    /// Creates a reader with the given byte source.
+    /// - Parameter readByte: A closure that returns the next byte, or `nil` if no more bytes are available.
+    init(readByte: @escaping () -> UInt8?) {
+        self.readByte = readByte
+    }
+
+    func readCharacter() -> Character? { // Decodes one sequence; nil on missing input or ill-formed bytes.
+        guard let firstByte = readByte() else { return nil }
+        guard let length = sequenceLength(forFirstByte: firstByte) else { return nil }
+        guard let bytes = bytes(forSequenceOfLength: length, startingWith: firstByte) else { return nil }
+        return character(from: bytes) // Bytes consumed above are never pushed back, even on failure.
+    }
+
+    private func sequenceLength(forFirstByte byte: UInt8) -> Int? { // Total length implied by the lead byte.
+        switch byte {
+        case 0x00 ... 0x7F: 1 // ASCII
+        case 0xC2 ... 0xDF: 2 // 2-byte sequence (0xC0-0xC1 are overlong encodings)
+        case 0xE0 ... 0xEF: 3 // 3-byte sequence
+        case 0xF0 ... 0xF4: 4 // 4-byte sequence (0xF5+ exceeds Unicode range)
+        default: nil // 0x80-0xC1 and 0xF5-0xFF cannot start a sequence
+        }
+    }
+
+    private func bytes(forSequenceOfLength length: Int, startingWith firstByte: UInt8) -> [UInt8]? {
+        var result: [UInt8] = [firstByte] // Lead byte followed by length - 1 continuation bytes.
+        for _ in 1 ..< length { // Empty range when length == 1, so ASCII reads nothing further.
+            guard let byte = readByte(), isContinuationByte(byte) else { return nil }
+            result.append(byte)
+        }
+        return result
+    }
+
+    private func isContinuationByte(_ byte: UInt8) -> Bool {
+        // UTF-8 continuation bytes have the pattern 10xxxxxx (0x80-0xBF)
+        (byte & 0xC0) == 0x80
+    }
+
+    private func character(from bytes: [UInt8]) -> Character? { // nil when the bytes do not decode as UTF-8.
+        String(bytes: bytes, encoding: .utf8).flatMap(\.first) // First Character of the decoded string.
+    }
+}
diff --git a/cli/Tests/NooraTests/Utilities/UTF8ReaderTests.swift b/cli/Tests/NooraTests/Utilities/UTF8ReaderTests.swift
new file mode 100644
index 00000000..14cb4179
--- /dev/null
+++ b/cli/Tests/NooraTests/Utilities/UTF8ReaderTests.swift
@@ -0,0 +1,70 @@
+import Testing
+
+@testable import Noora
+
+struct UTF8ReaderTests { // Exercises UTF8Reader with valid, invalid and empty byte sequences.
+    @Test(arguments: TestCase.allCases)
+    func decodesSingleCharacter(testCase: TestCase) { // Runs once per entry in TestCase.allCases.
+        var iter = testCase.bytes.makeIterator() // Captured and advanced by the reader's closure.
+        let reader = UTF8Reader { iter.next() }
+        #expect(reader.readCharacter() == testCase.expected)
+    }
+
+    @Test
+    func readsConsecutiveCharactersWithoutByteLeakage() { // One reader, four sequences of growing length.
+        let bytes: [UInt8] = [
+            0x41, // A (1-byte)
+            0xC3, 0xA9, // é (2-byte)
+            0xE4, 0xB8, 0xAD, // 中 (3-byte)
+            0xF0, 0x9F, 0x98, 0x80, // 😀 (4-byte)
+        ]
+        var iter = bytes.makeIterator()
+        let reader = UTF8Reader { iter.next() }
+
+        #expect(reader.readCharacter() == "A")
+        #expect(reader.readCharacter() == "é")
+        #expect(reader.readCharacter() == "中")
+        #expect(reader.readCharacter() == "😀")
+        #expect(reader.readCharacter() == nil) // Input exhausted.
+    }
+
+    struct TestCase: CustomTestStringConvertible, Sendable { // One parameterized decoding case.
+        let bytes: [UInt8] // Raw input handed to the reader.
+        let expected: Character? // Expected result of the first readCharacter() call; nil = failure.
+        let testDescription: String // Satisfies CustomTestStringConvertible.
+
+        static let allCases: [TestCase] = [
+            // 1-byte sequences (ASCII)
+            TestCase(bytes: [0x41], expected: "A", testDescription: "ASCII letter"),
+            TestCase(bytes: [0x00], expected: "\0", testDescription: "null character"),
+            TestCase(bytes: [0x7F], expected: "\u{7F}", testDescription: "ASCII max (DEL)"),
+
+            // 2-byte sequences
+            TestCase(bytes: [0xC3, 0xA9], expected: "é", testDescription: "Latin: French e-acute"),
+            TestCase(bytes: [0xD0, 0x90], expected: "А", testDescription: "Cyrillic: Russian A"),
+
+            // 3-byte sequences
+            TestCase(bytes: [0xE3, 0x81, 0x82], expected: "あ", testDescription: "Japanese hiragana"),
+            TestCase(bytes: [0xE4, 0xB8, 0xAD], expected: "中", testDescription: "Chinese hanzi"),
+            TestCase(bytes: [0xEA, 0xB0, 0x80], expected: "가", testDescription: "Korean hangul"),
+            TestCase(bytes: [0xE2, 0x82, 0xAC], expected: "€", testDescription: "Euro sign"),
+
+            // 4-byte sequences
+            TestCase(bytes: [0xF0, 0x9F, 0x98, 0x80], expected: "😀", testDescription: "Emoji"),
+            TestCase(bytes: [0xF0, 0x9F, 0x87, 0xAF], expected: "🇯", testDescription: "Regional indicator J"),
+
+            // Invalid sequences
+            TestCase(bytes: [0x80], expected: nil, testDescription: "Invalid: lone continuation byte"),
+            TestCase(bytes: [0xFF], expected: nil, testDescription: "Invalid: 0xFF is never valid"),
+            TestCase(bytes: [0xC3], expected: nil, testDescription: "Invalid: incomplete 2-byte sequence"),
+            TestCase(bytes: [0xE3, 0x81], expected: nil, testDescription: "Invalid: incomplete 3-byte sequence"),
+            TestCase(bytes: [0xF0, 0x9F, 0x98], expected: nil, testDescription: "Invalid: incomplete 4-byte sequence"),
+            TestCase(bytes: [0xC0, 0x80], expected: nil, testDescription: "Invalid: overlong encoding"),
+            TestCase(bytes: [0xF5, 0x80, 0x80, 0x80], expected: nil, testDescription: "Invalid: exceeds Unicode range"),
+            TestCase(bytes: [0xC3, 0x00], expected: nil, testDescription: "Invalid: bad continuation byte"),
+
+            // Empty input
+            TestCase(bytes: [], expected: nil, testDescription: "Empty input"),
+        ]
+    }
+}