// istuf8 II - what are the odds of random bytes being valid UTF-8?
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// doas pkg_add libutf8proc
#include <libutf8proc/utf8proc.h>
double
byteaudit(size_t len, size_t trials)
{
size_t pass = 0;
assert(len > 0);
utf8proc_int32_t ref;
switch (len) {
case 1: { // if we do not get 50% from this, something is wrong
utf8proc_uint8_t i = 0;
trials = 256;
do {
if (utf8proc_iterate(&i, 1, &ref) == 1) pass++;
} while (++i != 0);
break;
}
case 2: {
utf8proc_uint8_t ij[2] = {0};
uint16_t *n = (uint16_t *) &ij;
trials = 65536;
do {
utf8proc_ssize_t left = 2;
utf8proc_uint8_t *ip = ij;
while (1) {
utf8proc_ssize_t ret =
utf8proc_iterate(ip, left, &ref);
if (ret < 1) break;
ip += ret;
left -= ret;
if (left == 0) {
pass++;
break;
}
}
} while (++*n != 0);
break;
}
default: {
utf8proc_uint8_t *in = malloc(len);
if (!in) abort();
for (size_t i = 0; i < trials; i++) {
// PORTABILITY this may need libbsd or something else
arc4random_buf(in, len);
utf8proc_uint8_t *ip = in;
utf8proc_ssize_t left = (utf8proc_ssize_t) len;
while (1) {
utf8proc_ssize_t ret =
utf8proc_iterate(ip, left, &ref);
if (ret < 1) break;
ip += ret;
left -= ret;
if (left == 0) {
pass++;
break;
}
}
}
free(in);
}
}
return (double) pass / (double) trials * 100.0;
}
// Print 30 rows; each row lists the byteaudit() percentage for string
// lengths 1 through 8, requesting 10000 trials per call.
int
main(void)
{
    const int rows = 30;
    const size_t max_len = 8;
    const size_t trials = 10000U;

    int row = 0;
    while (row < rows) {
        size_t len = 1;
        while (len <= max_len) {
            double pct = byteaudit(len, trials);
            printf("%.2lf ", pct);
            len++;
        }
        putchar('\n');
        row++;
    }
    return 0;
}
// Response:
// 20 (Success), text/plain