1
0
mirror of https://github.com/Kitware/CMake.git synced 2025-10-15 03:48:02 +08:00

cmListFileLexer: Test for broken UTF-32-(BE|LE) BOM

This commit is contained in:
Sergiu Deitsch
2025-09-11 15:13:49 +02:00
parent 3b8ddf3f45
commit ca072e3734
9 changed files with 26 additions and 4 deletions

View File

@@ -2715,6 +2715,7 @@ void cmListFileLexer_Delete(cmListFileLexer* lexer)
static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f)
{
unsigned char b[2];
size_t n;
if (fread(b, 1, 2, f) == 2) {
if (b[0] == 0xEF && b[1] == 0xBB) {
if (fread(b, 1, 1, f) == 1 && b[0] == 0xBF) {
@@ -2730,13 +2731,21 @@ static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f)
} else if (b[0] == 0xFF && b[1] == 0xFE) {
fpos_t p;
fgetpos(f, &p);
if (fread(b, 1, 2, f) == 2 && b[0] == 0 && b[1] == 0) {
n = fread(b, 1, 2, f);
if (n == 2 && b[0] == 0 && b[1] == 0) {
return cmListFileLexer_BOM_UTF32LE;
}
if (fsetpos(f, &p) != 0) {
return cmListFileLexer_BOM_Broken;
}
return cmListFileLexer_BOM_UTF16LE;
/* In case we were able to subsequently read only a single byte out of two
(i.e., three in total), the file must be corrupt and the BOM cannot
represent a UTF-16-LE BOM since each code unit must consist of two
bytes. This avoids incorrectly detecting an incomplete UTF-32-LE BOM as
UTF-16-LE input. */
if (n % 2 == 0) {
return cmListFileLexer_BOM_UTF16LE;
}
}
}
if (fseek(f, 0, SEEK_SET) != 0) {

View File

@@ -442,6 +442,7 @@ void cmListFileLexer_Delete(cmListFileLexer* lexer)
static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f)
{
unsigned char b[2];
size_t n;
if (fread(b, 1, 2, f) == 2) {
if (b[0] == 0xEF && b[1] == 0xBB) {
if (fread(b, 1, 1, f) == 1 && b[0] == 0xBF) {
@@ -457,13 +458,21 @@ static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f)
} else if (b[0] == 0xFF && b[1] == 0xFE) {
fpos_t p;
fgetpos(f, &p);
if (fread(b, 1, 2, f) == 2 && b[0] == 0 && b[1] == 0) {
n = fread(b, 1, 2, f);
if (n == 2 && b[0] == 0 && b[1] == 0) {
return cmListFileLexer_BOM_UTF32LE;
}
if (fsetpos(f, &p) != 0) {
return cmListFileLexer_BOM_Broken;
}
return cmListFileLexer_BOM_UTF16LE;
/* In case we were able to subsequently read only a single byte out of two
(i.e., three in total), the file must be corrupt and the BOM cannot
represent a UTF-16-LE BOM since each code unit must consist of two
bytes. This avoids incorrectly detecting an incomplete UTF-32-LE BOM as
UTF-16-LE input. */
if (n % 2 == 0) {
return cmListFileLexer_BOM_UTF16LE;
}
}
}
if (fseek(f, 0, SEEK_SET) != 0) {

View File

@@ -0,0 +1 @@
1

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1 @@
1

Binary file not shown.

Binary file not shown.

View File

@@ -5,6 +5,8 @@ run_cmake(BOM-UTF-16-LE)
run_cmake(BOM-UTF-16-BE)
run_cmake(BOM-UTF-32-LE)
run_cmake(BOM-UTF-32-BE)
run_cmake(Broken-BOM-UTF-32-LE)
run_cmake(Broken-BOM-UTF-32-BE)
run_cmake(CommandSpaces)
run_cmake(CommandTabs)
run_cmake(CommandNewlines)