1From bd5e882cf6e0def3dd1bc106075d59a303fe0d1e Mon Sep 17 00:00:00 2001 2From: David Malcolm <dmalcolm@redhat.com> 3Date: Mon, 18 Oct 2021 18:55:31 -0400 4Subject: [PATCH] diagnostics: escape non-ASCII source bytes for certain 5 diagnostics 6MIME-Version: 1.0 7Content-Type: text/plain; charset=utf8 8Content-Transfer-Encoding: 8bit 9 10This patch adds support to GCC's diagnostic subsystem for escaping certain 11bytes and Unicode characters when quoting source code. 12 13Specifically, this patch adds a new flag rich_location::m_escape_on_output 14which is a hint from a diagnostic that non-ASCII bytes in the pertinent 15lines of the user's source code should be escaped when printed. 16 17The patch sets this for the following diagnostics: 18- when complaining about stray bytes in the program (when these 19are non-printable) 20- when complaining about "null character(s) ignored"; 21- for -Wnormalized= (and generate source ranges for such warnings) 22 23The escaping is controlled by a new option: 24 -fdiagnostics-escape-format=[unicode|bytes] 25 26For example, consider a diagnostic involving a source line containing the 27string "before" followed by the Unicode character U+03C0 ("GREEK SMALL 28LETTER PI", with UTF-8 encoding 0xCF 0x80) followed by the byte 0xBF 29(a stray UTF-8 trailing byte), followed by the string "after", where the 30diagnostic highlights the U+03C0 character. 31 32By default, this line will be printed verbatim to the user when 33reporting a diagnostic at it, as: 34 35 beforeπXafter 36 ^ 37 38(using X for the stray byte to avoid putting invalid UTF-8 in this 39commit message) 40 41If the diagnostic sets the "escape" flag, it will be printed as: 42 43 before<U+03C0><BF>after 44 ^~~~~~~~ 45 46with -fdiagnostics-escape-format=unicode (the default), or as: 47 48 before<CF><80><BF>after 49 ^~~~~~~~ 50 51if the user supplies -fdiagnostics-escape-format=bytes. 
52 53This only affects how the source is printed; it does not affect 54how column numbers are printed (as per -fdiagnostics-column-unit= 55and -fdiagnostics-column-origin=). 56 57gcc/c-family/ChangeLog: 58 * c-lex.c (c_lex_with_flags): When complaining about non-printable 59 CPP_OTHER tokens, set the "escape on output" flag. 60 61gcc/ChangeLog: 62 * common.opt (fdiagnostics-escape-format=): New. 63 (diagnostics_escape_format): New enum. 64 (DIAGNOSTICS_ESCAPE_FORMAT_UNICODE): New enum value. 65 (DIAGNOSTICS_ESCAPE_FORMAT_BYTES): Likewise. 66 * diagnostic-format-json.cc (json_end_diagnostic): Add 67 "escape-source" attribute. 68 * diagnostic-show-locus.c 69 (exploc_with_display_col::exploc_with_display_col): Replace 70 "tabstop" param with a cpp_char_column_policy and add an "aspect" 71 param. Use these to compute m_display_col accordingly. 72 (struct char_display_policy): New struct. 73 (layout::m_policy): New field. 74 (layout::m_escape_on_output): New field. 75 (def_policy): New function. 76 (make_range): Update for changes to exploc_with_display_col ctor. 77 (default_print_decoded_ch): New. 78 (width_per_escaped_byte): New. 79 (escape_as_bytes_width): New. 80 (escape_as_bytes_print): New. 81 (escape_as_unicode_width): New. 82 (escape_as_unicode_print): New. 83 (make_policy): New. 84 (layout::layout): Initialize new fields. Update m_exploc ctor 85 call for above change to ctor. 86 (layout::maybe_add_location_range): Update for changes to 87 exploc_with_display_col ctor. 88 (layout::calculate_x_offset_display): Update for change to 89 cpp_display_width. 90 (layout::print_source_line): Pass policy 91 to cpp_display_width_computation. Capture cpp_decoded_char when 92 calling process_next_codepoint. Move printing of source code to 93 m_policy.m_print_cb. 94 (line_label::line_label): Pass in policy rather than context. 95 (layout::print_any_labels): Update for change to line_label ctor. 
96 (get_affected_range): Pass in policy rather than context, updating 97 calls to location_compute_display_column accordingly. 98 (get_printed_columns): Likewise, also for cpp_display_width. 99 (correction::correction): Pass in policy rather than tabstop. 100 (correction::compute_display_cols): Pass m_policy rather than 101 m_tabstop to cpp_display_width. 102 (correction::m_tabstop): Replace with... 103 (correction::m_policy): ...this. 104 (line_corrections::line_corrections): Pass in policy rather than 105 context. 106 (line_corrections::m_context): Replace with... 107 (line_corrections::m_policy): ...this. 108 (line_corrections::add_hint): Update to use m_policy rather than 109 m_context. 110 (line_corrections::add_hint): Likewise. 111 (layout::print_trailing_fixits): Likewise. 112 (selftest::test_display_widths): New. 113 (selftest::test_layout_x_offset_display_utf8): Update to use 114 policy rather than tabstop. 115 (selftest::test_one_liner_labels_utf8): Add test of escaping 116 source lines. 117 (selftest::test_diagnostic_show_locus_one_liner_utf8): Update to 118 use policy rather than tabstop. 119 (selftest::test_overlapped_fixit_printing): Likewise. 120 (selftest::test_overlapped_fixit_printing_utf8): Likewise. 121 (selftest::test_overlapped_fixit_printing_2): Likewise. 122 (selftest::test_tab_expansion): Likewise. 123 (selftest::test_escaping_bytes_1): New. 124 (selftest::test_escaping_bytes_2): New. 125 (selftest::diagnostic_show_locus_c_tests): Call the new tests. 126 * diagnostic.c (diagnostic_initialize): Initialize 127 context->escape_format. 128 (convert_column_unit): Update to use default character width policy. 129 (selftest::test_diagnostic_get_location_text): Likewise. 130 * diagnostic.h (enum diagnostics_escape_format): New enum. 131 (diagnostic_context::escape_format): New field. 132 * doc/invoke.texi (-fdiagnostics-escape-format=): New option. 
133 (-fdiagnostics-format=): Add "escape-source" attribute to examples 134 of JSON output, and document it. 135 * input.c (location_compute_display_column): Pass in "policy" 136 rather than "tabstop", passing to 137 cpp_byte_column_to_display_column. 138 (selftest::test_cpp_utf8): Update to use cpp_char_column_policy. 139 * input.h (class cpp_char_column_policy): New forward decl. 140 (location_compute_display_column): Pass in "policy" rather than 141 "tabstop". 142 * opts.c (common_handle_option): Handle 143 OPT_fdiagnostics_escape_format_. 144 * selftest.c (temp_source_file::temp_source_file): New ctor 145 overload taking a size_t. 146 * selftest.h (temp_source_file::temp_source_file): Likewise. 147 148gcc/testsuite/ChangeLog: 149 * c-c++-common/diagnostic-format-json-1.c: Add regexp to consume 150 "escape-source" attribute. 151 * c-c++-common/diagnostic-format-json-2.c: Likewise. 152 * c-c++-common/diagnostic-format-json-3.c: Likewise. 153 * c-c++-common/diagnostic-format-json-4.c: Likewise, twice. 154 * c-c++-common/diagnostic-format-json-5.c: Likewise. 155 * gcc.dg/cpp/warn-normalized-4-bytes.c: New test. 156 * gcc.dg/cpp/warn-normalized-4-unicode.c: New test. 157 * gcc.dg/encoding-issues-bytes.c: New test. 158 * gcc.dg/encoding-issues-unicode.c: New test. 159 * gfortran.dg/diagnostic-format-json-1.F90: Add regexp to consume 160 "escape-source" attribute. 161 * gfortran.dg/diagnostic-format-json-2.F90: Likewise. 162 * gfortran.dg/diagnostic-format-json-3.F90: Likewise. 163 164libcpp/ChangeLog: 165 * charset.c (convert_escape): Use encoding_rich_location when 166 complaining about nonprintable unknown escape sequences. 167 (cpp_display_width_computation::cpp_display_width_computation): 168 Pass in policy rather than tabstop. 169 (cpp_display_width_computation::process_next_codepoint): Add "out" 170 param and populate *out if non-NULL. 171 (cpp_display_width_computation::advance_display_cols): Pass NULL 172 to process_next_codepoint. 
173 (cpp_byte_column_to_display_column): Pass in policy rather than 174 tabstop. Pass NULL to process_next_codepoint. 175 (cpp_display_column_to_byte_column): Pass in policy rather than 176 tabstop. 177 * errors.c (cpp_diagnostic_get_current_location): New function, 178 splitting out the logic from... 179 (cpp_diagnostic): ...here. 180 (cpp_warning_at): New function. 181 (cpp_pedwarning_at): New function. 182 * include/cpplib.h (cpp_warning_at): New decl for rich_location. 183 (cpp_pedwarning_at): Likewise. 184 (struct cpp_decoded_char): New. 185 (struct cpp_char_column_policy): New. 186 (cpp_display_width_computation::cpp_display_width_computation): 187 Replace "tabstop" param with "policy". 188 (cpp_display_width_computation::process_next_codepoint): Add "out" 189 param. 190 (cpp_display_width_computation::m_tabstop): Replace with... 191 (cpp_display_width_computation::m_policy): ...this. 192 (cpp_byte_column_to_display_column): Replace "tabstop" param with 193 "policy". 194 (cpp_display_width): Likewise. 195 (cpp_display_column_to_byte_column): Likewise. 196 * include/line-map.h (rich_location::escape_on_output_p): New. 197 (rich_location::set_escape_on_output): New. 198 (rich_location::m_escape_on_output): New. 199 * internal.h (cpp_diagnostic_get_current_location): New decl. 200 (class encoding_rich_location): New. 201 * lex.c (skip_whitespace): Use encoding_rich_location when 202 complaining about null characters. 203 (warn_about_normalization): Generate a source range when 204 complaining about improperly normalized tokens, rather than just a 205 point, and use encoding_rich_location so that the source code 206 is escaped on printing. 207 * line-map.c (rich_location::rich_location): Initialize 208 m_escape_on_output. 
209 210Signed-off-by: David Malcolm <dmalcolm@redhat.com> 211 212CVE: CVE-2021-42574 213Upstream-Status: Backport [https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=bd5e882cf6e0def3dd1bc106075d59a303fe0d1e] 214Signed-off-by: Pgowda <pgowda.cve@gmail.com> 215 216--- 217 gcc/c-family/c-lex.c | 6 +- 218 gcc/common.opt | 13 + 219 gcc/diagnostic-format-json.cc | 3 + 220 gcc/diagnostic-show-locus.c | 580 +++++++++++++++--- 221 gcc/diagnostic.c | 10 +- 222 gcc/diagnostic.h | 18 + 223 gcc/doc/invoke.texi | 43 +- 224 gcc/input.c | 62 +- 225 gcc/input.h | 7 +- 226 gcc/opts.c | 4 + 227 gcc/selftest.c | 15 + 228 gcc/selftest.h | 2 + 229 .../c-c++-common/diagnostic-format-json-1.c | 1 + 230 .../c-c++-common/diagnostic-format-json-2.c | 1 + 231 .../c-c++-common/diagnostic-format-json-3.c | 1 + 232 .../c-c++-common/diagnostic-format-json-4.c | 2 + 233 .../c-c++-common/diagnostic-format-json-5.c | 1 + 234 .../gcc.dg/cpp/warn-normalized-4-bytes.c | 21 + 235 .../gcc.dg/cpp/warn-normalized-4-unicode.c | 19 + 236 gcc/testsuite/gcc.dg/encoding-issues-bytes.c | Bin 0 -> 595 bytes 237 .../gcc.dg/encoding-issues-unicode.c | Bin 0 -> 613 bytes 238 .../gfortran.dg/diagnostic-format-json-1.F90 | 1 + 239 .../gfortran.dg/diagnostic-format-json-2.F90 | 1 + 240 .../gfortran.dg/diagnostic-format-json-3.F90 | 1 + 241 libcpp/charset.c | 63 +- 242 libcpp/errors.c | 82 ++- 243 libcpp/include/cpplib.h | 76 ++- 244 libcpp/include/line-map.h | 13 + 245 libcpp/internal.h | 23 + 246 libcpp/lex.c | 38 +- 247 libcpp/line-map.c | 3 +- 248 31 files changed, 942 insertions(+), 168 deletions(-) 249 create mode 100644 gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 250 create mode 100644 gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 251 create mode 100644 gcc/testsuite/gcc.dg/encoding-issues-bytes.c 252 create mode 100644 gcc/testsuite/gcc.dg/encoding-issues-unicode.c 253 254diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c 255--- a/gcc/c-family/c-lex.c 2021-07-27 23:55:06.980283060 -0700 256+++ 
b/gcc/c-family/c-lex.c 2021-12-14 01:16:01.541943272 -0800 257@@ -603,7 +603,11 @@ c_lex_with_flags (tree *value, location_ 258 else if (ISGRAPH (c)) 259 error_at (*loc, "stray %qc in program", (int) c); 260 else 261- error_at (*loc, "stray %<\\%o%> in program", (int) c); 262+ { 263+ rich_location rich_loc (line_table, *loc); 264+ rich_loc.set_escape_on_output (true); 265+ error_at (&rich_loc, "stray %<\\%o%> in program", (int) c); 266+ } 267 } 268 goto retry; 269 270diff --git a/gcc/common.opt b/gcc/common.opt 271--- a/gcc/common.opt 2021-12-13 22:08:44.939137107 -0800 272+++ b/gcc/common.opt 2021-12-14 01:16:01.541943272 -0800 273@@ -1348,6 +1348,10 @@ fdiagnostics-format= 274 Common Joined RejectNegative Enum(diagnostics_output_format) 275 -fdiagnostics-format=[text|json] Select output format. 276 277+fdiagnostics-escape-format= 278+Common Joined RejectNegative Enum(diagnostics_escape_format) 279+-fdiagnostics-escape-format=[unicode|bytes] Select how to escape non-printable-ASCII bytes in the source for diagnostics that suggest it. 280+ 281 ; Required for these enum values. 
282 SourceInclude 283 diagnostic.h 284@@ -1362,6 +1366,15 @@ EnumValue 285 Enum(diagnostics_column_unit) String(byte) Value(DIAGNOSTICS_COLUMN_UNIT_BYTE) 286 287 Enum 288+Name(diagnostics_escape_format) Type(int) 289+ 290+EnumValue 291+Enum(diagnostics_escape_format) String(unicode) Value(DIAGNOSTICS_ESCAPE_FORMAT_UNICODE) 292+ 293+EnumValue 294+Enum(diagnostics_escape_format) String(bytes) Value(DIAGNOSTICS_ESCAPE_FORMAT_BYTES) 295+ 296+Enum 297 Name(diagnostics_output_format) Type(int) 298 299 EnumValue 300diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c 301--- a/gcc/diagnostic.c 2021-07-27 23:55:07.232286576 -0700 302+++ b/gcc/diagnostic.c 2021-12-14 01:16:01.545943202 -0800 303@@ -230,6 +230,7 @@ diagnostic_initialize (diagnostic_contex 304 context->column_unit = DIAGNOSTICS_COLUMN_UNIT_DISPLAY; 305 context->column_origin = 1; 306 context->tabstop = 8; 307+ context->escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; 308 context->edit_context_ptr = NULL; 309 context->diagnostic_group_nesting_depth = 0; 310 context->diagnostic_group_emission_count = 0; 311@@ -382,7 +383,10 @@ convert_column_unit (enum diagnostics_co 312 gcc_unreachable (); 313 314 case DIAGNOSTICS_COLUMN_UNIT_DISPLAY: 315- return location_compute_display_column (s, tabstop); 316+ { 317+ cpp_char_column_policy policy (tabstop, cpp_wcwidth); 318+ return location_compute_display_column (s, policy); 319+ } 320 321 case DIAGNOSTICS_COLUMN_UNIT_BYTE: 322 return s.column; 323@@ -2275,8 +2279,8 @@ test_diagnostic_get_location_text () 324 const char *const content = "smile \xf0\x9f\x98\x82\n"; 325 const int line_bytes = strlen (content) - 1; 326 const int def_tabstop = 8; 327- const int display_width = cpp_display_width (content, line_bytes, 328- def_tabstop); 329+ const cpp_char_column_policy policy (def_tabstop, cpp_wcwidth); 330+ const int display_width = cpp_display_width (content, line_bytes, policy); 331 ASSERT_EQ (line_bytes - 2, display_width); 332 temp_source_file tmp (SELFTEST_LOCATION, ".c", 
content); 333 const char *const fname = tmp.get_filename (); 334diff --git a/gcc/diagnostic-format-json.cc b/gcc/diagnostic-format-json.cc 335--- a/gcc/diagnostic-format-json.cc 2021-07-27 23:55:07.232286576 -0700 336+++ b/gcc/diagnostic-format-json.cc 2021-12-14 01:16:01.541943272 -0800 337@@ -264,6 +264,9 @@ json_end_diagnostic (diagnostic_context 338 json::value *path_value = context->make_json_for_path (context, path); 339 diag_obj->set ("path", path_value); 340 } 341+ 342+ diag_obj->set ("escape-source", 343+ new json::literal (richloc->escape_on_output_p ())); 344 } 345 346 /* No-op implementation of "begin_group_cb" for JSON output. */ 347diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h 348--- a/gcc/diagnostic.h 2021-07-27 23:55:07.236286632 -0700 349+++ b/gcc/diagnostic.h 2021-12-14 01:16:01.545943202 -0800 350@@ -38,6 +38,20 @@ enum diagnostics_column_unit 351 DIAGNOSTICS_COLUMN_UNIT_BYTE 352 }; 353 354+/* An enum for controlling how to print non-ASCII characters/bytes when 355+ a diagnostic suggests escaping the source code on output. */ 356+ 357+enum diagnostics_escape_format 358+{ 359+ /* Escape non-ASCII Unicode characters in the form <U+XXXX> and 360+ non-UTF-8 bytes in the form <XX>. */ 361+ DIAGNOSTICS_ESCAPE_FORMAT_UNICODE, 362+ 363+ /* Escape non-ASCII bytes in the form <XX> (thus showing the underlying 364+ encoding of non-ASCII Unicode characters). */ 365+ DIAGNOSTICS_ESCAPE_FORMAT_BYTES 366+}; 367+ 368 /* Enum for overriding the standard output format. */ 369 370 enum diagnostics_output_format 371@@ -320,6 +334,10 @@ struct diagnostic_context 372 /* The size of the tabstop for tab expansion. */ 373 int tabstop; 374 375+ /* How should non-ASCII/non-printable bytes be escaped when 376+ a diagnostic suggests escaping the source code on output. */ 377+ enum diagnostics_escape_format escape_format; 378+ 379 /* If non-NULL, an edit_context to which fix-it hints should be 380 applied, for generating patches. 
*/ 381 edit_context *edit_context_ptr; 382diff --git a/gcc/diagnostic-show-locus.c b/gcc/diagnostic-show-locus.c 383--- a/gcc/diagnostic-show-locus.c 2021-07-27 23:55:07.232286576 -0700 384+++ b/gcc/diagnostic-show-locus.c 2021-12-14 01:16:01.545943202 -0800 385@@ -175,10 +175,26 @@ enum column_unit { 386 class exploc_with_display_col : public expanded_location 387 { 388 public: 389- exploc_with_display_col (const expanded_location &exploc, int tabstop) 390- : expanded_location (exploc), 391- m_display_col (location_compute_display_column (exploc, tabstop)) 392- {} 393+ exploc_with_display_col (const expanded_location &exploc, 394+ const cpp_char_column_policy &policy, 395+ enum location_aspect aspect) 396+ : expanded_location (exploc), 397+ m_display_col (location_compute_display_column (exploc, policy)) 398+ { 399+ if (exploc.column > 0) 400+ { 401+ /* m_display_col is now the final column of the byte. 402+ If escaping has happened, we may want the first column instead. */ 403+ if (aspect != LOCATION_ASPECT_FINISH) 404+ { 405+ expanded_location prev_exploc (exploc); 406+ prev_exploc.column--; 407+ int prev_display_col 408+ = (location_compute_display_column (prev_exploc, policy)); 409+ m_display_col = prev_display_col + 1; 410+ } 411+ } 412+ } 413 414 int m_display_col; 415 }; 416@@ -313,6 +329,31 @@ test_line_span () 417 418 #endif /* #if CHECKING_P */ 419 420+/* A bundle of information containing how to print unicode 421+ characters and bytes when quoting source code. 422+ 423+ Provides a unified place to support escaping some subset 424+ of characters to some format. 425+ 426+ Extends char_column_policy; printing is split out to avoid 427+ libcpp having to know about pretty_printer. 
*/ 428+ 429+struct char_display_policy : public cpp_char_column_policy 430+{ 431+ public: 432+ char_display_policy (int tabstop, 433+ int (*width_cb) (cppchar_t c), 434+ void (*print_cb) (pretty_printer *pp, 435+ const cpp_decoded_char &cp)) 436+ : cpp_char_column_policy (tabstop, width_cb), 437+ m_print_cb (print_cb) 438+ { 439+ } 440+ 441+ void (*m_print_cb) (pretty_printer *pp, 442+ const cpp_decoded_char &cp); 443+}; 444+ 445 /* A class to control the overall layout when printing a diagnostic. 446 447 The layout is determined within the constructor. 448@@ -345,6 +386,8 @@ class layout 449 450 void print_line (linenum_type row); 451 452+ void on_bad_codepoint (const char *ptr, cppchar_t ch, size_t ch_sz); 453+ 454 private: 455 bool will_show_line_p (linenum_type row) const; 456 void print_leading_fixits (linenum_type row); 457@@ -386,6 +429,7 @@ class layout 458 private: 459 diagnostic_context *m_context; 460 pretty_printer *m_pp; 461+ char_display_policy m_policy; 462 location_t m_primary_loc; 463 exploc_with_display_col m_exploc; 464 colorizer m_colorizer; 465@@ -398,6 +442,7 @@ class layout 466 auto_vec <line_span> m_line_spans; 467 int m_linenum_width; 468 int m_x_offset_display; 469+ bool m_escape_on_output; 470 }; 471 472 /* Implementation of "class colorizer". */ 473@@ -646,6 +691,11 @@ layout_range::intersects_line_p (linenum 474 /* Default for when we don't care what the tab expansion is set to. */ 475 static const int def_tabstop = 8; 476 477+static cpp_char_column_policy def_policy () 478+{ 479+ return cpp_char_column_policy (8, cpp_wcwidth); 480+} 481+ 482 /* Create some expanded locations for testing layout_range. The filename 483 member of the explocs is set to the empty string. 
This member will only be 484 inspected by the calls to location_compute_display_column() made from the 485@@ -662,10 +712,13 @@ make_range (int start_line, int start_co 486 = {"", start_line, start_col, NULL, false}; 487 const expanded_location finish_exploc 488 = {"", end_line, end_col, NULL, false}; 489- return layout_range (exploc_with_display_col (start_exploc, def_tabstop), 490- exploc_with_display_col (finish_exploc, def_tabstop), 491+ return layout_range (exploc_with_display_col (start_exploc, def_policy (), 492+ LOCATION_ASPECT_START), 493+ exploc_with_display_col (finish_exploc, def_policy (), 494+ LOCATION_ASPECT_FINISH), 495 SHOW_RANGE_WITHOUT_CARET, 496- exploc_with_display_col (start_exploc, def_tabstop), 497+ exploc_with_display_col (start_exploc, def_policy (), 498+ LOCATION_ASPECT_CARET), 499 0, NULL); 500 } 501 502@@ -959,6 +1012,164 @@ fixit_cmp (const void *p_a, const void * 503 return hint_a->get_start_loc () - hint_b->get_start_loc (); 504 } 505 506+/* Callbacks for use when not escaping the source. */ 507+ 508+/* The default callback for char_column_policy::m_width_cb is cpp_wcwidth. */ 509+ 510+/* Callback for char_display_policy::m_print_cb for printing source chars 511+ when not escaping the source. */ 512+ 513+static void 514+default_print_decoded_ch (pretty_printer *pp, 515+ const cpp_decoded_char &decoded_ch) 516+{ 517+ for (const char *ptr = decoded_ch.m_start_byte; 518+ ptr != decoded_ch.m_next_byte; ptr++) 519+ { 520+ if (*ptr == '\0' || *ptr == '\r') 521+ { 522+ pp_space (pp); 523+ continue; 524+ } 525+ 526+ pp_character (pp, *ptr); 527+ } 528+} 529+ 530+/* Callbacks for use with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */ 531+ 532+static const int width_per_escaped_byte = 4; 533+ 534+/* Callback for char_column_policy::m_width_cb for determining the 535+ display width when escaping with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. 
*/ 536+ 537+static int 538+escape_as_bytes_width (cppchar_t ch) 539+{ 540+ if (ch < 0x80 && ISPRINT (ch)) 541+ return cpp_wcwidth (ch); 542+ else 543+ { 544+ if (ch <= 0x7F) return 1 * width_per_escaped_byte; 545+ if (ch <= 0x7FF) return 2 * width_per_escaped_byte; 546+ if (ch <= 0xFFFF) return 3 * width_per_escaped_byte; 547+ return 4 * width_per_escaped_byte; 548+ } 549+} 550+ 551+/* Callback for char_display_policy::m_print_cb for printing source chars 552+ when escaping with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */ 553+ 554+static void 555+escape_as_bytes_print (pretty_printer *pp, 556+ const cpp_decoded_char &decoded_ch) 557+{ 558+ if (!decoded_ch.m_valid_ch) 559+ { 560+ for (const char *iter = decoded_ch.m_start_byte; 561+ iter != decoded_ch.m_next_byte; ++iter) 562+ { 563+ char buf[16]; 564+ sprintf (buf, "<%02x>", (unsigned char)*iter); 565+ pp_string (pp, buf); 566+ } 567+ return; 568+ } 569+ 570+ cppchar_t ch = decoded_ch.m_ch; 571+ if (ch < 0x80 && ISPRINT (ch)) 572+ pp_character (pp, ch); 573+ else 574+ { 575+ for (const char *iter = decoded_ch.m_start_byte; 576+ iter < decoded_ch.m_next_byte; ++iter) 577+ { 578+ char buf[16]; 579+ sprintf (buf, "<%02x>", (unsigned char)*iter); 580+ pp_string (pp, buf); 581+ } 582+ } 583+} 584+ 585+/* Callbacks for use with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */ 586+ 587+/* Callback for char_column_policy::m_width_cb for determining the 588+ display width when escaping with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */ 589+ 590+static int 591+escape_as_unicode_width (cppchar_t ch) 592+{ 593+ if (ch < 0x80 && ISPRINT (ch)) 594+ return cpp_wcwidth (ch); 595+ else 596+ { 597+ // Width of "<U+%04x>" 598+ if (ch > 0xfffff) 599+ return 10; 600+ else if (ch > 0xffff) 601+ return 9; 602+ else 603+ return 8; 604+ } 605+} 606+ 607+/* Callback for char_display_policy::m_print_cb for printing source chars 608+ when escaping with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. 
*/ 609+ 610+static void 611+escape_as_unicode_print (pretty_printer *pp, 612+ const cpp_decoded_char &decoded_ch) 613+{ 614+ if (!decoded_ch.m_valid_ch) 615+ { 616+ escape_as_bytes_print (pp, decoded_ch); 617+ return; 618+ } 619+ 620+ cppchar_t ch = decoded_ch.m_ch; 621+ if (ch < 0x80 && ISPRINT (ch)) 622+ pp_character (pp, ch); 623+ else 624+ { 625+ char buf[16]; 626+ sprintf (buf, "<U+%04X>", ch); 627+ pp_string (pp, buf); 628+ } 629+} 630+ 631+/* Populate a char_display_policy based on DC and RICHLOC. */ 632+ 633+static char_display_policy 634+make_policy (const diagnostic_context &dc, 635+ const rich_location &richloc) 636+{ 637+ /* The default is to not escape non-ASCII bytes. */ 638+ char_display_policy result 639+ (dc.tabstop, cpp_wcwidth, default_print_decoded_ch); 640+ 641+ /* If the diagnostic suggests escaping non-ASCII bytes, then 642+ use policy from user-supplied options. */ 643+ if (richloc.escape_on_output_p ()) 644+ { 645+ result.m_undecoded_byte_width = width_per_escaped_byte; 646+ switch (dc.escape_format) 647+ { 648+ default: 649+ gcc_unreachable (); 650+ case DIAGNOSTICS_ESCAPE_FORMAT_UNICODE: 651+ result.m_width_cb = escape_as_unicode_width; 652+ result.m_print_cb = escape_as_unicode_print; 653+ break; 654+ case DIAGNOSTICS_ESCAPE_FORMAT_BYTES: 655+ result.m_width_cb = escape_as_bytes_width; 656+ result.m_print_cb = escape_as_bytes_print; 657+ break; 658+ } 659+ } 660+ 661+ return result; 662+} 663+ 664 /* Implementation of class layout. */ 665 666 /* Constructor for class layout. 
667@@ -975,8 +1186,10 @@ layout::layout (diagnostic_context * con 668 diagnostic_t diagnostic_kind) 669 : m_context (context), 670 m_pp (context->printer), 671+ m_policy (make_policy (*context, *richloc)), 672 m_primary_loc (richloc->get_range (0)->m_loc), 673- m_exploc (richloc->get_expanded_location (0), context->tabstop), 674+ m_exploc (richloc->get_expanded_location (0), m_policy, 675+ LOCATION_ASPECT_CARET), 676 m_colorizer (context, diagnostic_kind), 677 m_colorize_source_p (context->colorize_source_p), 678 m_show_labels_p (context->show_labels_p), 679@@ -986,7 +1199,8 @@ layout::layout (diagnostic_context * con 680 m_fixit_hints (richloc->get_num_fixit_hints ()), 681 m_line_spans (1 + richloc->get_num_locations ()), 682 m_linenum_width (0), 683- m_x_offset_display (0) 684+ m_x_offset_display (0), 685+ m_escape_on_output (richloc->escape_on_output_p ()) 686 { 687 for (unsigned int idx = 0; idx < richloc->get_num_locations (); idx++) 688 { 689@@ -1072,10 +1286,13 @@ layout::maybe_add_location_range (const 690 691 /* Everything is now known to be in the correct source file, 692 but it may require further sanitization. 
*/ 693- layout_range ri (exploc_with_display_col (start, m_context->tabstop), 694- exploc_with_display_col (finish, m_context->tabstop), 695+ layout_range ri (exploc_with_display_col (start, m_policy, 696+ LOCATION_ASPECT_START), 697+ exploc_with_display_col (finish, m_policy, 698+ LOCATION_ASPECT_FINISH), 699 loc_range->m_range_display_kind, 700- exploc_with_display_col (caret, m_context->tabstop), 701+ exploc_with_display_col (caret, m_policy, 702+ LOCATION_ASPECT_CARET), 703 original_idx, loc_range->m_label); 704 705 /* If we have a range that finishes before it starts (perhaps 706@@ -1409,7 +1626,7 @@ layout::calculate_x_offset_display () 707 = get_line_bytes_without_trailing_whitespace (line.get_buffer (), 708 line.length ()); 709 int eol_display_column 710- = cpp_display_width (line.get_buffer (), line_bytes, m_context->tabstop); 711+ = cpp_display_width (line.get_buffer (), line_bytes, m_policy); 712 if (caret_display_column > eol_display_column 713 || !caret_display_column) 714 { 715@@ -1488,7 +1705,7 @@ layout::print_source_line (linenum_type 716 /* This object helps to keep track of which display column we are at, which is 717 necessary for computing the line bounds in display units, for doing 718 tab expansion, and for implementing m_x_offset_display. */ 719- cpp_display_width_computation dw (line, line_bytes, m_context->tabstop); 720+ cpp_display_width_computation dw (line, line_bytes, m_policy); 721 722 /* Skip the first m_x_offset_display display columns. In case the leading 723 portion that will be skipped ends with a character with wcwidth > 1, then 724@@ -1536,7 +1753,8 @@ layout::print_source_line (linenum_type 725 tabs and replacing some control bytes with spaces as necessary. 
*/ 726 const char *c = dw.next_byte (); 727 const int start_disp_col = dw.display_cols_processed () + 1; 728- const int this_display_width = dw.process_next_codepoint (); 729+ cpp_decoded_char cp; 730+ const int this_display_width = dw.process_next_codepoint (&cp); 731 if (*c == '\t') 732 { 733 /* The returned display width is the number of spaces into which the 734@@ -1545,15 +1763,6 @@ layout::print_source_line (linenum_type 735 pp_space (m_pp); 736 continue; 737 } 738- if (*c == '\0' || *c == '\r') 739- { 740- /* cpp_wcwidth() promises to return 1 for all control bytes, and we 741- want to output these as a single space too, so this case is 742- actually the same as the '\t' case. */ 743- gcc_assert (this_display_width == 1); 744- pp_space (m_pp); 745- continue; 746- } 747 748 /* We have a (possibly multibyte) character to output; update the line 749 bounds if it is not whitespace. */ 750@@ -1565,7 +1774,8 @@ layout::print_source_line (linenum_type 751 } 752 753 /* Output the character. */ 754- while (c != dw.next_byte ()) pp_character (m_pp, *c++); 755+ m_policy.m_print_cb (m_pp, cp); 756+ c = dw.next_byte (); 757 } 758 print_newline (); 759 return lbounds; 760@@ -1664,14 +1874,14 @@ layout::print_annotation_line (linenum_t 761 class line_label 762 { 763 public: 764- line_label (diagnostic_context *context, int state_idx, int column, 765+ line_label (const cpp_char_column_policy &policy, 766+ int state_idx, int column, 767 label_text text) 768 : m_state_idx (state_idx), m_column (column), 769 m_text (text), m_label_line (0), m_has_vbar (true) 770 { 771 const int bytes = strlen (text.m_buffer); 772- m_display_width 773- = cpp_display_width (text.m_buffer, bytes, context->tabstop); 774+ m_display_width = cpp_display_width (text.m_buffer, bytes, policy); 775 } 776 777 /* Sorting is primarily by column, then by state index. 
*/ 778@@ -1731,7 +1941,7 @@ layout::print_any_labels (linenum_type r 779 if (text.m_buffer == NULL) 780 continue; 781 782- labels.safe_push (line_label (m_context, i, disp_col, text)); 783+ labels.safe_push (line_label (m_policy, i, disp_col, text)); 784 } 785 } 786 787@@ -2011,7 +2221,7 @@ public: 788 789 /* Get the range of bytes or display columns that HINT would affect. */ 790 static column_range 791-get_affected_range (diagnostic_context *context, 792+get_affected_range (const cpp_char_column_policy &policy, 793 const fixit_hint *hint, enum column_unit col_unit) 794 { 795 expanded_location exploc_start = expand_location (hint->get_start_loc ()); 796@@ -2022,13 +2232,11 @@ get_affected_range (diagnostic_context * 797 int finish_column; 798 if (col_unit == CU_DISPLAY_COLS) 799 { 800- start_column 801- = location_compute_display_column (exploc_start, context->tabstop); 802+ start_column = location_compute_display_column (exploc_start, policy); 803 if (hint->insertion_p ()) 804 finish_column = start_column - 1; 805 else 806- finish_column 807- = location_compute_display_column (exploc_finish, context->tabstop); 808+ finish_column = location_compute_display_column (exploc_finish, policy); 809 } 810 else 811 { 812@@ -2041,12 +2249,13 @@ get_affected_range (diagnostic_context * 813 /* Get the range of display columns that would be printed for HINT. 
*/ 814 815 static column_range 816-get_printed_columns (diagnostic_context *context, const fixit_hint *hint) 817+get_printed_columns (const cpp_char_column_policy &policy, 818+ const fixit_hint *hint) 819 { 820 expanded_location exploc = expand_location (hint->get_start_loc ()); 821- int start_column = location_compute_display_column (exploc, context->tabstop); 822+ int start_column = location_compute_display_column (exploc, policy); 823 int hint_width = cpp_display_width (hint->get_string (), hint->get_length (), 824- context->tabstop); 825+ policy); 826 int final_hint_column = start_column + hint_width - 1; 827 if (hint->insertion_p ()) 828 { 829@@ -2056,8 +2265,7 @@ get_printed_columns (diagnostic_context 830 { 831 exploc = expand_location (hint->get_next_loc ()); 832 --exploc.column; 833- int finish_column 834- = location_compute_display_column (exploc, context->tabstop); 835+ int finish_column = location_compute_display_column (exploc, policy); 836 return column_range (start_column, 837 MAX (finish_column, final_hint_column)); 838 } 839@@ -2075,13 +2283,13 @@ public: 840 column_range affected_columns, 841 column_range printed_columns, 842 const char *new_text, size_t new_text_len, 843- int tabstop) 844+ const cpp_char_column_policy &policy) 845 : m_affected_bytes (affected_bytes), 846 m_affected_columns (affected_columns), 847 m_printed_columns (printed_columns), 848 m_text (xstrdup (new_text)), 849 m_byte_length (new_text_len), 850- m_tabstop (tabstop), 851+ m_policy (policy), 852 m_alloc_sz (new_text_len + 1) 853 { 854 compute_display_cols (); 855@@ -2099,7 +2307,7 @@ public: 856 857 void compute_display_cols () 858 { 859- m_display_cols = cpp_display_width (m_text, m_byte_length, m_tabstop); 860+ m_display_cols = cpp_display_width (m_text, m_byte_length, m_policy); 861 } 862 863 void overwrite (int dst_offset, const char_span &src_span) 864@@ -2127,7 +2335,7 @@ public: 865 char *m_text; 866 size_t m_byte_length; /* Not including null-terminator. 
*/ 867 int m_display_cols; 868- int m_tabstop; 869+ const cpp_char_column_policy &m_policy; 870 size_t m_alloc_sz; 871 }; 872 873@@ -2163,15 +2371,16 @@ correction::ensure_terminated () 874 class line_corrections 875 { 876 public: 877- line_corrections (diagnostic_context *context, const char *filename, 878+ line_corrections (const char_display_policy &policy, 879+ const char *filename, 880 linenum_type row) 881- : m_context (context), m_filename (filename), m_row (row) 882+ : m_policy (policy), m_filename (filename), m_row (row) 883 {} 884 ~line_corrections (); 885 886 void add_hint (const fixit_hint *hint); 887 888- diagnostic_context *m_context; 889+ const char_display_policy &m_policy; 890 const char *m_filename; 891 linenum_type m_row; 892 auto_vec <correction *> m_corrections; 893@@ -2217,10 +2426,10 @@ source_line::source_line (const char *fi 894 void 895 line_corrections::add_hint (const fixit_hint *hint) 896 { 897- column_range affected_bytes = get_affected_range (m_context, hint, CU_BYTES); 898- column_range affected_columns = get_affected_range (m_context, hint, 899+ column_range affected_bytes = get_affected_range (m_policy, hint, CU_BYTES); 900+ column_range affected_columns = get_affected_range (m_policy, hint, 901 CU_DISPLAY_COLS); 902- column_range printed_columns = get_printed_columns (m_context, hint); 903+ column_range printed_columns = get_printed_columns (m_policy, hint); 904 905 /* Potentially consolidate. */ 906 if (!m_corrections.is_empty ()) 907@@ -2289,7 +2498,7 @@ line_corrections::add_hint (const fixit_ 908 printed_columns, 909 hint->get_string (), 910 hint->get_length (), 911- m_context->tabstop)); 912+ m_policy)); 913 } 914 915 /* If there are any fixit hints on source line ROW, print them. 916@@ -2303,7 +2512,7 @@ layout::print_trailing_fixits (linenum_t 917 { 918 /* Build a list of correction instances for the line, 919 potentially consolidating hints (for the sake of readability). 
*/ 920- line_corrections corrections (m_context, m_exploc.file, row); 921+ line_corrections corrections (m_policy, m_exploc.file, row); 922 for (unsigned int i = 0; i < m_fixit_hints.length (); i++) 923 { 924 const fixit_hint *hint = m_fixit_hints[i]; 925@@ -2646,6 +2855,59 @@ namespace selftest { 926 927 /* Selftests for diagnostic_show_locus. */ 928 929+/* Verify that cpp_display_width correctly handles escaping. */ 930+ 931+static void 932+test_display_widths () 933+{ 934+ gcc_rich_location richloc (UNKNOWN_LOCATION); 935+ 936+ /* U+03C0 "GREEK SMALL LETTER PI". */ 937+ const char *pi = "\xCF\x80"; 938+ /* U+1F642 "SLIGHTLY SMILING FACE". */ 939+ const char *emoji = "\xF0\x9F\x99\x82"; 940+ /* Stray trailing byte of a UTF-8 character. */ 941+ const char *stray = "\xBF"; 942+ /* U+10FFFF. */ 943+ const char *max_codepoint = "\xF4\x8F\xBF\xBF"; 944+ 945+ /* No escaping. */ 946+ { 947+ test_diagnostic_context dc; 948+ char_display_policy policy (make_policy (dc, richloc)); 949+ ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 1); 950+ ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 2); 951+ ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 1); 952+ /* Don't check width of U+10FFFF; it's in a private use plane. 
*/ 953+ } 954+ 955+ richloc.set_escape_on_output (true); 956+ 957+ { 958+ test_diagnostic_context dc; 959+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; 960+ char_display_policy policy (make_policy (dc, richloc)); 961+ ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 8); 962+ ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 9); 963+ ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 4); 964+ ASSERT_EQ (cpp_display_width (max_codepoint, strlen (max_codepoint), 965+ policy), 966+ strlen ("<U+10FFFF>")); 967+ } 968+ 969+ { 970+ test_diagnostic_context dc; 971+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; 972+ char_display_policy policy (make_policy (dc, richloc)); 973+ ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 8); 974+ ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 16); 975+ ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 4); 976+ ASSERT_EQ (cpp_display_width (max_codepoint, strlen (max_codepoint), 977+ policy), 978+ 16); 979+ } 980+} 981+ 982 /* For precise tests of the layout, make clear where the source line will 983 start. 
test_left_margin sets the total byte count from the left side of the 984 screen to the start of source lines, after the line number and the separator, 985@@ -2715,10 +2977,10 @@ test_layout_x_offset_display_utf8 (const 986 char_span lspan = location_get_source_line (tmp.get_filename (), 1); 987 ASSERT_EQ (line_display_cols, 988 cpp_display_width (lspan.get_buffer (), lspan.length (), 989- def_tabstop)); 990+ def_policy ())); 991 ASSERT_EQ (line_display_cols, 992 location_compute_display_column (expand_location (line_end), 993- def_tabstop)); 994+ def_policy ())); 995 ASSERT_EQ (0, memcmp (lspan.get_buffer () + (emoji_col - 1), 996 "\xf0\x9f\x98\x82\xf0\x9f\x98\x82", 8)); 997 998@@ -2866,12 +3128,13 @@ test_layout_x_offset_display_tab (const 999 ASSERT_EQ ('\t', *(lspan.get_buffer () + (tab_col - 1))); 1000 for (int tabstop = 1; tabstop != num_tabstops; ++tabstop) 1001 { 1002+ cpp_char_column_policy policy (tabstop, cpp_wcwidth); 1003 ASSERT_EQ (line_bytes + extra_width[tabstop], 1004 cpp_display_width (lspan.get_buffer (), lspan.length (), 1005- tabstop)); 1006+ policy)); 1007 ASSERT_EQ (line_bytes + extra_width[tabstop], 1008 location_compute_display_column (expand_location (line_end), 1009- tabstop)); 1010+ policy)); 1011 } 1012 1013 /* Check that the tab is expanded to the expected number of spaces. */ 1014@@ -4003,6 +4266,43 @@ test_one_liner_labels_utf8 () 1015 " bb\xf0\x9f\x98\x82\xf0\x9f\x98\x82\n", 1016 pp_formatted_text (dc.printer)); 1017 } 1018+ 1019+ /* Example of escaping the source lines. 
*/ 1020+ { 1021+ text_range_label label0 ("label 0\xf0\x9f\x98\x82"); 1022+ text_range_label label1 ("label 1\xcf\x80"); 1023+ text_range_label label2 ("label 2\xcf\x80"); 1024+ gcc_rich_location richloc (foo, &label0); 1025+ richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1); 1026+ richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2); 1027+ richloc.set_escape_on_output (true); 1028+ 1029+ { 1030+ test_diagnostic_context dc; 1031+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; 1032+ diagnostic_show_locus (&dc, &richloc, DK_ERROR); 1033+ ASSERT_STREQ (" <U+1F602>_foo = <U+03C0>_bar.<U+1F602>_field<U+03C0>;\n" 1034+ " ^~~~~~~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~\n" 1035+ " | | |\n" 1036+ " | | label 2\xcf\x80\n" 1037+ " | label 1\xcf\x80\n" 1038+ " label 0\xf0\x9f\x98\x82\n", 1039+ pp_formatted_text (dc.printer)); 1040+ } 1041+ { 1042+ test_diagnostic_context dc; 1043+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; 1044+ diagnostic_show_locus (&dc, &richloc, DK_ERROR); 1045+ ASSERT_STREQ 1046+ (" <f0><9f><98><82>_foo = <cf><80>_bar.<f0><9f><98><82>_field<cf><80>;\n" 1047+ " ^~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" 1048+ " | | |\n" 1049+ " | | label 2\xcf\x80\n" 1050+ " | label 1\xcf\x80\n" 1051+ " label 0\xf0\x9f\x98\x82\n", 1052+ pp_formatted_text (dc.printer)); 1053+ } 1054+ } 1055 } 1056 1057 /* Make sure that colorization codes don't interrupt a multibyte 1058@@ -4057,9 +4357,9 @@ test_diagnostic_show_locus_one_liner_utf 1059 1060 char_span lspan = location_get_source_line (tmp.get_filename (), 1); 1061 ASSERT_EQ (25, cpp_display_width (lspan.get_buffer (), lspan.length (), 1062- def_tabstop)); 1063+ def_policy ())); 1064 ASSERT_EQ (25, location_compute_display_column (expand_location (line_end), 1065- def_tabstop)); 1066+ def_policy ())); 1067 1068 test_one_liner_simple_caret_utf8 (); 1069 test_one_liner_caret_and_range_utf8 (); 1070@@ -4445,30 +4745,31 @@ test_overlapped_fixit_printing (const li 1071 
pp_formatted_text (dc.printer)); 1072 1073 /* Unit-test the line_corrections machinery. */ 1074+ char_display_policy policy (make_policy (dc, richloc)); 1075 ASSERT_EQ (3, richloc.get_num_fixit_hints ()); 1076 const fixit_hint *hint_0 = richloc.get_fixit_hint (0); 1077 ASSERT_EQ (column_range (12, 12), 1078- get_affected_range (&dc, hint_0, CU_BYTES)); 1079+ get_affected_range (policy, hint_0, CU_BYTES)); 1080 ASSERT_EQ (column_range (12, 12), 1081- get_affected_range (&dc, hint_0, CU_DISPLAY_COLS)); 1082- ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0)); 1083+ get_affected_range (policy, hint_0, CU_DISPLAY_COLS)); 1084+ ASSERT_EQ (column_range (12, 22), get_printed_columns (policy, hint_0)); 1085 const fixit_hint *hint_1 = richloc.get_fixit_hint (1); 1086 ASSERT_EQ (column_range (18, 18), 1087- get_affected_range (&dc, hint_1, CU_BYTES)); 1088+ get_affected_range (policy, hint_1, CU_BYTES)); 1089 ASSERT_EQ (column_range (18, 18), 1090- get_affected_range (&dc, hint_1, CU_DISPLAY_COLS)); 1091- ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1)); 1092+ get_affected_range (policy, hint_1, CU_DISPLAY_COLS)); 1093+ ASSERT_EQ (column_range (18, 20), get_printed_columns (policy, hint_1)); 1094 const fixit_hint *hint_2 = richloc.get_fixit_hint (2); 1095 ASSERT_EQ (column_range (29, 28), 1096- get_affected_range (&dc, hint_2, CU_BYTES)); 1097+ get_affected_range (policy, hint_2, CU_BYTES)); 1098 ASSERT_EQ (column_range (29, 28), 1099- get_affected_range (&dc, hint_2, CU_DISPLAY_COLS)); 1100- ASSERT_EQ (column_range (29, 29), get_printed_columns (&dc, hint_2)); 1101+ get_affected_range (policy, hint_2, CU_DISPLAY_COLS)); 1102+ ASSERT_EQ (column_range (29, 29), get_printed_columns (policy, hint_2)); 1103 1104 /* Add each hint in turn to a line_corrections instance, 1105 and verify that they are consolidated into one correction instance 1106 as expected. 
*/ 1107- line_corrections lc (&dc, tmp.get_filename (), 1); 1108+ line_corrections lc (policy, tmp.get_filename (), 1); 1109 1110 /* The first replace hint by itself. */ 1111 lc.add_hint (hint_0); 1112@@ -4660,30 +4961,31 @@ test_overlapped_fixit_printing_utf8 (con 1113 pp_formatted_text (dc.printer)); 1114 1115 /* Unit-test the line_corrections machinery. */ 1116+ char_display_policy policy (make_policy (dc, richloc)); 1117 ASSERT_EQ (3, richloc.get_num_fixit_hints ()); 1118 const fixit_hint *hint_0 = richloc.get_fixit_hint (0); 1119 ASSERT_EQ (column_range (14, 14), 1120- get_affected_range (&dc, hint_0, CU_BYTES)); 1121+ get_affected_range (policy, hint_0, CU_BYTES)); 1122 ASSERT_EQ (column_range (12, 12), 1123- get_affected_range (&dc, hint_0, CU_DISPLAY_COLS)); 1124- ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0)); 1125+ get_affected_range (policy, hint_0, CU_DISPLAY_COLS)); 1126+ ASSERT_EQ (column_range (12, 22), get_printed_columns (policy, hint_0)); 1127 const fixit_hint *hint_1 = richloc.get_fixit_hint (1); 1128 ASSERT_EQ (column_range (22, 22), 1129- get_affected_range (&dc, hint_1, CU_BYTES)); 1130+ get_affected_range (policy, hint_1, CU_BYTES)); 1131 ASSERT_EQ (column_range (18, 18), 1132- get_affected_range (&dc, hint_1, CU_DISPLAY_COLS)); 1133- ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1)); 1134+ get_affected_range (policy, hint_1, CU_DISPLAY_COLS)); 1135+ ASSERT_EQ (column_range (18, 20), get_printed_columns (policy, hint_1)); 1136 const fixit_hint *hint_2 = richloc.get_fixit_hint (2); 1137 ASSERT_EQ (column_range (35, 34), 1138- get_affected_range (&dc, hint_2, CU_BYTES)); 1139+ get_affected_range (policy, hint_2, CU_BYTES)); 1140 ASSERT_EQ (column_range (30, 29), 1141- get_affected_range (&dc, hint_2, CU_DISPLAY_COLS)); 1142- ASSERT_EQ (column_range (30, 30), get_printed_columns (&dc, hint_2)); 1143+ get_affected_range (policy, hint_2, CU_DISPLAY_COLS)); 1144+ ASSERT_EQ (column_range (30, 30), 
get_printed_columns (policy, hint_2)); 1145 1146 /* Add each hint in turn to a line_corrections instance, 1147 and verify that they are consolidated into one correction instance 1148 as expected. */ 1149- line_corrections lc (&dc, tmp.get_filename (), 1); 1150+ line_corrections lc (policy, tmp.get_filename (), 1); 1151 1152 /* The first replace hint by itself. */ 1153 lc.add_hint (hint_0); 1154@@ -4877,15 +5179,16 @@ test_overlapped_fixit_printing_2 (const 1155 richloc.add_fixit_insert_before (col_21, "}"); 1156 1157 /* These fixits should be accepted; they can't be consolidated. */ 1158+ char_display_policy policy (make_policy (dc, richloc)); 1159 ASSERT_EQ (2, richloc.get_num_fixit_hints ()); 1160 const fixit_hint *hint_0 = richloc.get_fixit_hint (0); 1161 ASSERT_EQ (column_range (23, 22), 1162- get_affected_range (&dc, hint_0, CU_BYTES)); 1163- ASSERT_EQ (column_range (23, 23), get_printed_columns (&dc, hint_0)); 1164+ get_affected_range (policy, hint_0, CU_BYTES)); 1165+ ASSERT_EQ (column_range (23, 23), get_printed_columns (policy, hint_0)); 1166 const fixit_hint *hint_1 = richloc.get_fixit_hint (1); 1167 ASSERT_EQ (column_range (21, 20), 1168- get_affected_range (&dc, hint_1, CU_BYTES)); 1169- ASSERT_EQ (column_range (21, 21), get_printed_columns (&dc, hint_1)); 1170+ get_affected_range (policy, hint_1, CU_BYTES)); 1171+ ASSERT_EQ (column_range (21, 21), get_printed_columns (policy, hint_1)); 1172 1173 /* Verify that they're printed correctly. 
*/ 1174 diagnostic_show_locus (&dc, &richloc, DK_ERROR); 1175@@ -5152,10 +5455,11 @@ test_tab_expansion (const line_table_cas 1176 ....................123 45678901234 56789012345 columns */ 1177 1178 const int tabstop = 8; 1179+ cpp_char_column_policy policy (tabstop, cpp_wcwidth); 1180 const int first_non_ws_byte_col = 7; 1181 const int right_quote_byte_col = 15; 1182 const int last_byte_col = 25; 1183- ASSERT_EQ (35, cpp_display_width (content, last_byte_col, tabstop)); 1184+ ASSERT_EQ (35, cpp_display_width (content, last_byte_col, policy)); 1185 1186 temp_source_file tmp (SELFTEST_LOCATION, ".c", content); 1187 line_table_test ltt (case_); 1188@@ -5198,6 +5502,114 @@ test_tab_expansion (const line_table_cas 1189 } 1190 } 1191 1192+/* Verify that the escaping machinery can cope with a variety of different 1193+ invalid bytes. */ 1194+ 1195+static void 1196+test_escaping_bytes_1 (const line_table_case &case_) 1197+{ 1198+ const char content[] = "before\0\1\2\3\r\x80\xff""after\n"; 1199+ const size_t sz = sizeof (content); 1200+ temp_source_file tmp (SELFTEST_LOCATION, ".c", content, sz); 1201+ line_table_test ltt (case_); 1202+ const line_map_ordinary *ord_map = linemap_check_ordinary 1203+ (linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 0)); 1204+ linemap_line_start (line_table, 1, 100); 1205+ 1206+ location_t finish 1207+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 1208+ strlen (content)); 1209+ 1210+ if (finish > LINE_MAP_MAX_LOCATION_WITH_COLS) 1211+ return; 1212+ 1213+ /* Locations of the NUL and \r bytes. 
*/ 1214+ location_t nul_loc 1215+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 7); 1216+ location_t r_loc 1217+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 11); 1218+ gcc_rich_location richloc (nul_loc); 1219+ richloc.add_range (r_loc); 1220+ 1221+ { 1222+ test_diagnostic_context dc; 1223+ diagnostic_show_locus (&dc, &richloc, DK_ERROR); 1224+ ASSERT_STREQ (" before \1\2\3 \x80\xff""after\n" 1225+ " ^ ~\n", 1226+ pp_formatted_text (dc.printer)); 1227+ } 1228+ richloc.set_escape_on_output (true); 1229+ { 1230+ test_diagnostic_context dc; 1231+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; 1232+ diagnostic_show_locus (&dc, &richloc, DK_ERROR); 1233+ ASSERT_STREQ 1234+ (" before<U+0000><U+0001><U+0002><U+0003><U+000D><80><ff>after\n" 1235+ " ^~~~~~~~ ~~~~~~~~\n", 1236+ pp_formatted_text (dc.printer)); 1237+ } 1238+ { 1239+ test_diagnostic_context dc; 1240+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; 1241+ diagnostic_show_locus (&dc, &richloc, DK_ERROR); 1242+ ASSERT_STREQ (" before<00><01><02><03><0d><80><ff>after\n" 1243+ " ^~~~ ~~~~\n", 1244+ pp_formatted_text (dc.printer)); 1245+ } 1246+} 1247+ 1248+/* As above, but verify that we handle the initial byte of a line 1249+ correctly. */ 1250+ 1251+static void 1252+test_escaping_bytes_2 (const line_table_case &case_) 1253+{ 1254+ const char content[] = "\0after\n"; 1255+ const size_t sz = sizeof (content); 1256+ temp_source_file tmp (SELFTEST_LOCATION, ".c", content, sz); 1257+ line_table_test ltt (case_); 1258+ const line_map_ordinary *ord_map = linemap_check_ordinary 1259+ (linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 0)); 1260+ linemap_line_start (line_table, 1, 100); 1261+ 1262+ location_t finish 1263+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 1264+ strlen (content)); 1265+ 1266+ if (finish > LINE_MAP_MAX_LOCATION_WITH_COLS) 1267+ return; 1268+ 1269+ /* Location of the NUL byte. 
*/ 1270+ location_t nul_loc 1271+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 1); 1272+ gcc_rich_location richloc (nul_loc); 1273+ 1274+ { 1275+ test_diagnostic_context dc; 1276+ diagnostic_show_locus (&dc, &richloc, DK_ERROR); 1277+ ASSERT_STREQ (" after\n" 1278+ " ^\n", 1279+ pp_formatted_text (dc.printer)); 1280+ } 1281+ richloc.set_escape_on_output (true); 1282+ { 1283+ test_diagnostic_context dc; 1284+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; 1285+ diagnostic_show_locus (&dc, &richloc, DK_ERROR); 1286+ ASSERT_STREQ (" <U+0000>after\n" 1287+ " ^~~~~~~~\n", 1288+ pp_formatted_text (dc.printer)); 1289+ } 1290+ { 1291+ test_diagnostic_context dc; 1292+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; 1293+ diagnostic_show_locus (&dc, &richloc, DK_ERROR); 1294+ ASSERT_STREQ (" <00>after\n" 1295+ " ^~~~\n", 1296+ pp_formatted_text (dc.printer)); 1297+ } 1298+} 1299+ 1300 /* Verify that line numbers are correctly printed for the case of 1301 a multiline range in which the width of the line numbers changes 1302 (e.g. from "9" to "10"). 
*/ 1303@@ -5254,6 +5666,8 @@ diagnostic_show_locus_c_tests () 1304 test_layout_range_for_single_line (); 1305 test_layout_range_for_multiple_lines (); 1306 1307+ test_display_widths (); 1308+ 1309 for_each_line_table_case (test_layout_x_offset_display_utf8); 1310 for_each_line_table_case (test_layout_x_offset_display_tab); 1311 1312@@ -5274,6 +5688,8 @@ diagnostic_show_locus_c_tests () 1313 for_each_line_table_case (test_fixit_replace_containing_newline); 1314 for_each_line_table_case (test_fixit_deletion_affecting_newline); 1315 for_each_line_table_case (test_tab_expansion); 1316+ for_each_line_table_case (test_escaping_bytes_1); 1317+ for_each_line_table_case (test_escaping_bytes_2); 1318 1319 test_line_numbers_multiline_range (); 1320 } 1321diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi 1322--- a/gcc/doc/invoke.texi 2021-12-13 23:23:05.764437151 -0800 1323+++ b/gcc/doc/invoke.texi 2021-12-14 01:16:01.553943061 -0800 1324@@ -312,7 +312,8 @@ Objective-C and Objective-C++ Dialects}. 1325 -fdiagnostics-show-path-depths @gol 1326 -fno-show-column @gol 1327 -fdiagnostics-column-unit=@r{[}display@r{|}byte@r{]} @gol 1328--fdiagnostics-column-origin=@var{origin}} 1329+-fdiagnostics-column-origin=@var{origin} @gol 1330+-fdiagnostics-escape-format=@r{[}unicode@r{|}bytes@r{]}} 1331 1332 @item Warning Options 1333 @xref{Warning Options,,Options to Request or Suppress Warnings}. 1334@@ -5083,6 +5084,38 @@ first column. The default value of 1 co 1335 behavior and to the GNU style guide. Some utilities may perform better with an 1336 origin of 0; any non-negative value may be specified. 1337 1338+@item -fdiagnostics-escape-format=@var{FORMAT} 1339+@opindex fdiagnostics-escape-format 1340+When GCC prints pertinent source lines for a diagnostic it normally attempts 1341+to print the source bytes directly. However, some diagnostics relate to encoding 1342+issues in the source file, such as malformed UTF-8, or issues with Unicode 1343+normalization. 
These diagnostics are flagged so that GCC will escape bytes 1344+that are not printable ASCII when printing their pertinent source lines. 1345+ 1346+This option controls how such bytes should be escaped. 1347+ 1348+The default @var{FORMAT}, @samp{unicode}, displays Unicode characters that 1349+are not printable ASCII in the form @samp{<U+XXXX>}, and bytes that do not 1350+correspond to a Unicode character validly encoded in UTF-8 will be 1351+displayed as hexadecimal in the form @samp{<XX>}. 1352+ 1353+For example, a source line containing the string @samp{before} followed by the 1354+Unicode character U+03C0 (``GREEK SMALL LETTER PI'', with UTF-8 encoding 1355+0xCF 0x80) followed by the byte 0xBF (a stray UTF-8 trailing byte), followed by 1356+the string @samp{after} will be printed for such a diagnostic as: 1357+ 1358+@smallexample 1359+ before<U+03C0><BF>after 1360+@end smallexample 1361+ 1362+Setting @var{FORMAT} to @samp{bytes} will display all non-printable-ASCII bytes 1363+in the form @samp{<XX>}, thus showing the underlying encoding of non-ASCII 1364+Unicode characters. For the example above, the following will be printed: 1365+ 1366+@smallexample 1367+ before<CF><80><BF>after 1368+@end smallexample 1369+ 1370 @item -fdiagnostics-format=@var{FORMAT} 1371 @opindex fdiagnostics-format 1372 Select a different format for printing diagnostics. 1373@@ -5150,9 +5183,11 @@ might be printed in JSON form (after for 1374 @} 1375 @} 1376 ], 1377+ "escape-source": false, 1378 "message": "...this statement, but the latter is @dots{}" 1379 @} 1380 ] 1381+ "escape-source": false, 1382 "column-origin": 1, 1383 @}, 1384 @dots{} 1385@@ -5239,6 +5274,7 @@ of the expression, which have labels. 
I 1386 "label": "T @{aka struct t@}" 1387 @} 1388 ], 1389+ "escape-source": false, 1390 "message": "invalid operands to binary + @dots{}" 1391 @} 1392 @end smallexample 1393@@ -5292,6 +5328,7 @@ might be printed in JSON form as: 1394 @} 1395 @} 1396 ], 1397+ "escape-source": false, 1398 "message": "\u2018struct s\u2019 has no member named @dots{}" 1399 @} 1400 @end smallexample 1401@@ -5349,6 +5386,10 @@ For example, the intraprocedural example 1402 ] 1403 @end smallexample 1404 1405+Diagnostics have a boolean attribute @code{escape-source}, hinting whether 1406+non-ASCII bytes should be escaped when printing the pertinent lines of 1407+source code (@code{true} for diagnostics involving source encoding issues). 1408+ 1409 @end table 1410 1411 @node Warning Options 1412diff --git a/gcc/input.c b/gcc/input.c 1413--- a/gcc/input.c 2021-07-27 23:55:07.328287915 -0700 1414+++ b/gcc/input.c 2021-12-14 01:16:01.553943061 -0800 1415@@ -913,7 +913,8 @@ make_location (location_t caret, source_ 1416 source line in order to calculate the display width. If that cannot be done 1417 for any reason, then returns the byte column as a fallback. */ 1418 int 1419-location_compute_display_column (expanded_location exploc, int tabstop) 1420+location_compute_display_column (expanded_location exploc, 1421+ const cpp_char_column_policy &policy) 1422 { 1423 if (!(exploc.file && *exploc.file && exploc.line && exploc.column)) 1424 return exploc.column; 1425@@ -921,7 +922,7 @@ location_compute_display_column (expande 1426 /* If line is NULL, this function returns exploc.column which is the 1427 desired fallback. 
*/ 1428 return cpp_byte_column_to_display_column (line.get_buffer (), line.length (), 1429- exploc.column, tabstop); 1430+ exploc.column, policy); 1431 } 1432 1433 /* Dump statistics to stderr about the memory usage of the line_table 1434@@ -3611,43 +3612,50 @@ test_line_offset_overflow () 1435 void test_cpp_utf8 () 1436 { 1437 const int def_tabstop = 8; 1438+ cpp_char_column_policy policy (def_tabstop, cpp_wcwidth); 1439+ 1440 /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */ 1441 { 1442- int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, def_tabstop); 1443+ int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy); 1444 ASSERT_EQ (8, w_bad); 1445- int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, def_tabstop); 1446+ int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy); 1447 ASSERT_EQ (5, w_ctrl); 1448 } 1449 1450 /* Verify that wcwidth of valid UTF-8 is as expected. */ 1451 { 1452- const int w_pi = cpp_display_width ("\xcf\x80", 2, def_tabstop); 1453+ const int w_pi = cpp_display_width ("\xcf\x80", 2, policy); 1454 ASSERT_EQ (1, w_pi); 1455- const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, def_tabstop); 1456+ const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy); 1457 ASSERT_EQ (2, w_emoji); 1458 const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2, 1459- def_tabstop); 1460+ policy); 1461 ASSERT_EQ (1, w_umlaut_precomposed); 1462 const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3, 1463- def_tabstop); 1464+ policy); 1465 ASSERT_EQ (1, w_umlaut_combining); 1466- const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, def_tabstop); 1467+ const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy); 1468 ASSERT_EQ (2, w_han); 1469- const int w_ascii = cpp_display_width ("GCC", 3, def_tabstop); 1470+ const int w_ascii = cpp_display_width ("GCC", 3, policy); 1471 ASSERT_EQ (3, w_ascii); 1472 const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82" 1473 "\x9f! 
\xe4\xb8\xba y\xcc\x88", 1474- 24, def_tabstop); 1475+ 24, policy); 1476 ASSERT_EQ (18, w_mixed); 1477 } 1478 1479 /* Verify that display width properly expands tabs. */ 1480 { 1481 const char *tstr = "\tabc\td"; 1482- ASSERT_EQ (6, cpp_display_width (tstr, 6, 1)); 1483- ASSERT_EQ (10, cpp_display_width (tstr, 6, 3)); 1484- ASSERT_EQ (17, cpp_display_width (tstr, 6, 8)); 1485- ASSERT_EQ (1, cpp_display_column_to_byte_column (tstr, 6, 7, 8)); 1486+ ASSERT_EQ (6, cpp_display_width (tstr, 6, 1487+ cpp_char_column_policy (1, cpp_wcwidth))); 1488+ ASSERT_EQ (10, cpp_display_width (tstr, 6, 1489+ cpp_char_column_policy (3, cpp_wcwidth))); 1490+ ASSERT_EQ (17, cpp_display_width (tstr, 6, 1491+ cpp_char_column_policy (8, cpp_wcwidth))); 1492+ ASSERT_EQ (1, 1493+ cpp_display_column_to_byte_column 1494+ (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth))); 1495 } 1496 1497 /* Verify that cpp_byte_column_to_display_column can go past the end, 1498@@ -3660,13 +3668,13 @@ void test_cpp_utf8 () 1499 /* 111122223456 1500 Byte columns. */ 1501 1502- ASSERT_EQ (5, cpp_display_width (str, 6, def_tabstop)); 1503+ ASSERT_EQ (5, cpp_display_width (str, 6, policy)); 1504 ASSERT_EQ (105, 1505- cpp_byte_column_to_display_column (str, 6, 106, def_tabstop)); 1506+ cpp_byte_column_to_display_column (str, 6, 106, policy)); 1507 ASSERT_EQ (10000, 1508- cpp_byte_column_to_display_column (NULL, 0, 10000, def_tabstop)); 1509+ cpp_byte_column_to_display_column (NULL, 0, 10000, policy)); 1510 ASSERT_EQ (0, 1511- cpp_byte_column_to_display_column (NULL, 10000, 0, def_tabstop)); 1512+ cpp_byte_column_to_display_column (NULL, 10000, 0, policy)); 1513 } 1514 1515 /* Verify that cpp_display_column_to_byte_column can go past the end, 1516@@ -3680,25 +3688,25 @@ void test_cpp_utf8 () 1517 /* 000000000000000000000000000000000111111 1518 111122223333444456666777788889999012345 1519 Byte columns. 
*/ 1520- ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, def_tabstop)); 1521+ ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy)); 1522 ASSERT_EQ (15, 1523- cpp_display_column_to_byte_column (str, 15, 11, def_tabstop)); 1524+ cpp_display_column_to_byte_column (str, 15, 11, policy)); 1525 ASSERT_EQ (115, 1526- cpp_display_column_to_byte_column (str, 15, 111, def_tabstop)); 1527+ cpp_display_column_to_byte_column (str, 15, 111, policy)); 1528 ASSERT_EQ (10000, 1529- cpp_display_column_to_byte_column (NULL, 0, 10000, def_tabstop)); 1530+ cpp_display_column_to_byte_column (NULL, 0, 10000, policy)); 1531 ASSERT_EQ (0, 1532- cpp_display_column_to_byte_column (NULL, 10000, 0, def_tabstop)); 1533+ cpp_display_column_to_byte_column (NULL, 10000, 0, policy)); 1534 1535 /* Verify that we do not interrupt a UTF-8 sequence. */ 1536- ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, def_tabstop)); 1537+ ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy)); 1538 1539 for (int byte_col = 1; byte_col <= 15; ++byte_col) 1540 { 1541 const int disp_col 1542- = cpp_byte_column_to_display_column (str, 15, byte_col, def_tabstop); 1543+ = cpp_byte_column_to_display_column (str, 15, byte_col, policy); 1544 const int byte_col2 1545- = cpp_display_column_to_byte_column (str, 15, disp_col, def_tabstop); 1546+ = cpp_display_column_to_byte_column (str, 15, disp_col, policy); 1547 1548 /* If we ask for the display column in the middle of a UTF-8 1549 sequence, it will return the length of the partial sequence, 1550diff --git a/gcc/input.h b/gcc/input.h 1551--- a/gcc/input.h 2021-07-27 23:55:07.328287915 -0700 1552+++ b/gcc/input.h 2021-12-14 01:16:01.553943061 -0800 1553@@ -39,8 +39,11 @@ STATIC_ASSERT (BUILTINS_LOCATION < RESER 1554 extern bool is_location_from_builtin_token (location_t); 1555 extern expanded_location expand_location (location_t); 1556 1557-extern int location_compute_display_column (expanded_location exploc, 1558- int 
tabstop); 1559+class cpp_char_column_policy; 1560+ 1561+extern int 1562+location_compute_display_column (expanded_location exploc, 1563+ const cpp_char_column_policy &policy); 1564 1565 /* A class capturing the bounds of a buffer, to allow for run-time 1566 bounds-checking in a checked build. */ 1567diff --git a/gcc/opts.c b/gcc/opts.c 1568--- a/gcc/opts.c 2021-07-27 23:55:07.364288417 -0700 1569+++ b/gcc/opts.c 2021-12-14 01:16:01.553943061 -0800 1570@@ -2573,6 +2573,10 @@ common_handle_option (struct gcc_options 1571 dc->column_origin = value; 1572 break; 1573 1574+ case OPT_fdiagnostics_escape_format_: 1575+ dc->escape_format = (enum diagnostics_escape_format)value; 1576+ break; 1577+ 1578 case OPT_fdiagnostics_show_cwe: 1579 dc->show_cwe = value; 1580 break; 1581diff --git a/gcc/selftest.c b/gcc/selftest.c 1582--- a/gcc/selftest.c 2021-07-27 23:55:07.500290315 -0700 1583+++ b/gcc/selftest.c 2021-12-14 01:16:01.557942991 -0800 1584@@ -193,6 +193,21 @@ temp_source_file::temp_source_file (cons 1585 fclose (out); 1586 } 1587 1588+/* As above, but with a size, to allow for NUL bytes in CONTENT. */ 1589+ 1590+temp_source_file::temp_source_file (const location &loc, 1591+ const char *suffix, 1592+ const char *content, 1593+ size_t sz) 1594+: named_temp_file (suffix) 1595+{ 1596+ FILE *out = fopen (get_filename (), "w"); 1597+ if (!out) 1598+ fail_formatted (loc, "unable to open tempfile: %s", get_filename ()); 1599+ fwrite (content, sz, 1, out); 1600+ fclose (out); 1601+} 1602+ 1603 /* Avoid introducing locale-specific differences in the results 1604 by hardcoding open_quote and close_quote. 
*/ 1605 1606diff --git a/gcc/selftest.h b/gcc/selftest.h 1607--- a/gcc/selftest.h 2021-07-27 23:55:07.500290315 -0700 1608+++ b/gcc/selftest.h 2021-12-14 01:16:01.557942991 -0800 1609@@ -112,6 +112,8 @@ class temp_source_file : public named_te 1610 public: 1611 temp_source_file (const location &loc, const char *suffix, 1612 const char *content); 1613+ temp_source_file (const location &loc, const char *suffix, 1614+ const char *content, size_t sz); 1615 }; 1616 1617 /* RAII-style class for avoiding introducing locale-specific differences 1618diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 1619--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-07-27 23:55:07.596291654 -0700 1620+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-12-14 01:16:01.557942991 -0800 1621@@ -9,6 +9,7 @@ 1622 1623 /* { dg-regexp "\"kind\": \"error\"" } */ 1624 /* { dg-regexp "\"column-origin\": 1" } */ 1625+/* { dg-regexp "\"escape-source\": false" } */ 1626 /* { dg-regexp "\"message\": \"#error message\"" } */ 1627 1628 /* { dg-regexp "\"caret\": \{" } */ 1629diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 1630--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-07-27 23:55:07.596291654 -0700 1631+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-12-14 01:16:01.557942991 -0800 1632@@ -9,6 +9,7 @@ 1633 1634 /* { dg-regexp "\"kind\": \"warning\"" } */ 1635 /* { dg-regexp "\"column-origin\": 1" } */ 1636+/* { dg-regexp "\"escape-source\": false" } */ 1637 /* { dg-regexp "\"message\": \"#warning message\"" } */ 1638 /* { dg-regexp "\"option\": \"-Wcpp\"" } */ 1639 /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */ 1640diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 1641--- 
a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-07-27 23:55:07.596291654 -0700 1642+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-12-14 01:16:01.557942991 -0800 1643@@ -9,6 +9,7 @@ 1644 1645 /* { dg-regexp "\"kind\": \"error\"" } */ 1646 /* { dg-regexp "\"column-origin\": 1" } */ 1647+/* { dg-regexp "\"escape-source\": false" } */ 1648 /* { dg-regexp "\"message\": \"#warning message\"" } */ 1649 /* { dg-regexp "\"option\": \"-Werror=cpp\"" } */ 1650 /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */ 1651diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 1652--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-07-27 23:55:07.596291654 -0700 1653+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-12-14 01:16:01.557942991 -0800 1654@@ -19,6 +19,7 @@ int test (void) 1655 1656 /* { dg-regexp "\"kind\": \"note\"" } */ 1657 /* { dg-regexp "\"message\": \"...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'\"" } */ 1658+/* { dg-regexp "\"escape-source\": false" } */ 1659 1660 /* { dg-regexp "\"caret\": \{" } */ 1661 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-4.c\"" } */ 1662@@ -39,6 +40,7 @@ int test (void) 1663 /* { dg-regexp "\"kind\": \"warning\"" } */ 1664 /* { dg-regexp "\"column-origin\": 1" } */ 1665 /* { dg-regexp "\"message\": \"this 'if' clause does not guard...\"" } */ 1666+/* { dg-regexp "\"escape-source\": false" } */ 1667 /* { dg-regexp "\"option\": \"-Wmisleading-indentation\"" } */ 1668 /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wmisleading-indentation\"" } */ 1669 1670diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 1671--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-07-27 23:55:07.596291654 -0700 1672+++ 
b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-12-14 01:16:01.557942991 -0800 1673@@ -14,6 +14,7 @@ int test (struct s *ptr) 1674 1675 /* { dg-regexp "\"kind\": \"error\"" } */ 1676 /* { dg-regexp "\"column-origin\": 1" } */ 1677+/* { dg-regexp "\"escape-source\": false" } */ 1678 /* { dg-regexp "\"message\": \".*\"" } */ 1679 1680 /* Verify fix-it hints. */ 1681diff --git a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 1682--- a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 1969-12-31 16:00:00.000000000 -0800 1683+++ b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 2021-12-14 01:16:01.557942991 -0800 1684@@ -0,0 +1,21 @@ 1685+// { dg-do preprocess } 1686+// { dg-options "-std=gnu99 -Werror=normalized=nfc -fdiagnostics-show-caret -fdiagnostics-escape-format=bytes" } 1687+/* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */ 1688+ 1689+/* གྷ = U+0F43 TIBETAN LETTER GHA, which has decomposition "0F42 0FB7" i.e. 1690+ U+0F42 TIBETAN LETTER GA: ག 1691+ U+0FB7 TIBETAN SUBJOINED LETTER HA: ྷ 1692+ 1693+ The UTF-8 encoding of U+0F43 TIBETAN LETTER GHA is: E0 BD 83. */ 1694+ 1695+foo before_\u0F43_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } 1696+/* { dg-begin-multiline-output "" } 1697+ foo before_\u0F43_after bar 1698+ ^~~~~~~~~~~~~~~~~~~ 1699+ { dg-end-multiline-output "" } */ 1700+ 1701+foo before_གྷ_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." 
} 1702+/* { dg-begin-multiline-output "" } 1703+ foo before_<e0><bd><83>_after bar 1704+ ^~~~~~~~~~~~~~~~~~~~~~~~~ 1705+ { dg-end-multiline-output "" } */ 1706diff --git a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 1707--- a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 1969-12-31 16:00:00.000000000 -0800 1708+++ b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 2021-12-14 01:16:01.557942991 -0800 1709@@ -0,0 +1,19 @@ 1710+// { dg-do preprocess } 1711+// { dg-options "-std=gnu99 -Werror=normalized=nfc -fdiagnostics-show-caret -fdiagnostics-escape-format=unicode" } 1712+/* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */ 1713+ 1714+/* གྷ = U+0F43 TIBETAN LETTER GHA, which has decomposition "0F42 0FB7" i.e. 1715+ U+0F42 TIBETAN LETTER GA: ག 1716+ U+0FB7 TIBETAN SUBJOINED LETTER HA: ྷ */ 1717+ 1718+foo before_\u0F43_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } 1719+/* { dg-begin-multiline-output "" } 1720+ foo before_\u0F43_after bar 1721+ ^~~~~~~~~~~~~~~~~~~ 1722+ { dg-end-multiline-output "" } */ 1723+ 1724+foo before_གྷ_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } 1725+/* { dg-begin-multiline-output "" } 1726+ foo before_<U+0F43>_after bar 1727+ ^~~~~~~~~~~~~~~~~~~~~ 1728+ { dg-end-multiline-output "" } */ 1729diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 1730--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-07-27 23:55:08.472303878 -0700 1731+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-12-14 01:16:01.557942991 -0800 1732@@ -9,6 +9,7 @@ 1733 1734 ! { dg-regexp "\"kind\": \"error\"" } 1735 ! { dg-regexp "\"column-origin\": 1" } 1736+! { dg-regexp "\"escape-source\": false" } 1737 ! { dg-regexp "\"message\": \"#error message\"" } 1738 1739 ! 
{ dg-regexp "\"caret\": \{" } 1740diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 1741--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-07-27 23:55:08.472303878 -0700 1742+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-12-14 01:16:01.557942991 -0800 1743@@ -9,6 +9,7 @@ 1744 1745 ! { dg-regexp "\"kind\": \"warning\"" } 1746 ! { dg-regexp "\"column-origin\": 1" } 1747+! { dg-regexp "\"escape-source\": false" } 1748 ! { dg-regexp "\"message\": \"#warning message\"" } 1749 ! { dg-regexp "\"option\": \"-Wcpp\"" } 1750 ! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" } 1751diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 1752--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-07-27 23:55:08.472303878 -0700 1753+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-12-14 01:16:01.557942991 -0800 1754@@ -9,6 +9,7 @@ 1755 1756 ! { dg-regexp "\"kind\": \"error\"" } 1757 ! { dg-regexp "\"column-origin\": 1" } 1758+! { dg-regexp "\"escape-source\": false" } 1759 ! { dg-regexp "\"message\": \"#warning message\"" } 1760 ! { dg-regexp "\"option\": \"-Werror=cpp\"" } 1761 ! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" } 1762diff --git a/libcpp/charset.c b/libcpp/charset.c 1763--- a/libcpp/charset.c 2021-07-27 23:55:08.712307227 -0700 1764+++ b/libcpp/charset.c 2021-12-14 01:16:01.557942991 -0800 1765@@ -1552,12 +1552,14 @@ convert_escape (cpp_reader *pfile, const 1766 "unknown escape sequence: '\\%c'", (int) c); 1767 else 1768 { 1769+ encoding_rich_location rich_loc (pfile); 1770+ 1771 /* diagnostic.c does not support "%03o". When it does, this 1772 code can use %03o directly in the diagnostic again. 
*/ 1773 char buf[32]; 1774 sprintf(buf, "%03o", (int) c); 1775- cpp_error (pfile, CPP_DL_PEDWARN, 1776- "unknown escape sequence: '\\%s'", buf); 1777+ cpp_error_at (pfile, CPP_DL_PEDWARN, &rich_loc, 1778+ "unknown escape sequence: '\\%s'", buf); 1779 } 1780 } 1781 1782@@ -2280,14 +2282,16 @@ cpp_string_location_reader::get_next () 1783 } 1784 1785 cpp_display_width_computation:: 1786-cpp_display_width_computation (const char *data, int data_length, int tabstop) : 1787+cpp_display_width_computation (const char *data, int data_length, 1788+ const cpp_char_column_policy &policy) : 1789 m_begin (data), 1790 m_next (m_begin), 1791 m_bytes_left (data_length), 1792- m_tabstop (tabstop), 1793+ m_policy (policy), 1794 m_display_cols (0) 1795 { 1796- gcc_assert (m_tabstop > 0); 1797+ gcc_assert (policy.m_tabstop > 0); 1798+ gcc_assert (policy.m_width_cb); 1799 } 1800 1801 1802@@ -2299,19 +2303,28 @@ cpp_display_width_computation (const cha 1803 point to a valid UTF-8-encoded sequence, then it will be treated as a single 1804 byte with display width 1. m_cur_display_col is the current display column, 1805 relative to which tab stops should be expanded. Returns the display width of 1806- the codepoint just processed. */ 1807+ the codepoint just processed. 1808+ If OUT is non-NULL, it is populated. 
*/ 1809 1810 int 1811-cpp_display_width_computation::process_next_codepoint () 1812+cpp_display_width_computation::process_next_codepoint (cpp_decoded_char *out) 1813 { 1814 cppchar_t c; 1815 int next_width; 1816 1817+ if (out) 1818+ out->m_start_byte = m_next; 1819+ 1820 if (*m_next == '\t') 1821 { 1822 ++m_next; 1823 --m_bytes_left; 1824- next_width = m_tabstop - (m_display_cols % m_tabstop); 1825+ next_width = m_policy.m_tabstop - (m_display_cols % m_policy.m_tabstop); 1826+ if (out) 1827+ { 1828+ out->m_ch = '\t'; 1829+ out->m_valid_ch = true; 1830+ } 1831 } 1832 else if (one_utf8_to_cppchar ((const uchar **) &m_next, &m_bytes_left, &c) 1833 != 0) 1834@@ -2321,14 +2334,24 @@ cpp_display_width_computation::process_n 1835 of one. */ 1836 ++m_next; 1837 --m_bytes_left; 1838- next_width = 1; 1839+ next_width = m_policy.m_undecoded_byte_width; 1840+ if (out) 1841+ out->m_valid_ch = false; 1842 } 1843 else 1844 { 1845 /* one_utf8_to_cppchar() has updated m_next and m_bytes_left for us. */ 1846- next_width = cpp_wcwidth (c); 1847+ next_width = m_policy.m_width_cb (c); 1848+ if (out) 1849+ { 1850+ out->m_ch = c; 1851+ out->m_valid_ch = true; 1852+ } 1853 } 1854 1855+ if (out) 1856+ out->m_next_byte = m_next; 1857+ 1858 m_display_cols += next_width; 1859 return next_width; 1860 } 1861@@ -2344,7 +2367,7 @@ cpp_display_width_computation::advance_d 1862 const int start = m_display_cols; 1863 const int target = start + n; 1864 while (m_display_cols < target && !done ()) 1865- process_next_codepoint (); 1866+ process_next_codepoint (NULL); 1867 return m_display_cols - start; 1868 } 1869 1870@@ -2352,29 +2375,33 @@ cpp_display_width_computation::advance_d 1871 how many display columns are occupied by the first COLUMN bytes. COLUMN 1872 may exceed DATA_LENGTH, in which case the phantom bytes at the end are 1873 treated as if they have display width 1. Tabs are expanded to the next tab 1874- stop, relative to the start of DATA. 
*/ 1875+ stop, relative to the start of DATA, and non-printable-ASCII characters 1876+ will be escaped as per POLICY. */ 1877 1878 int 1879 cpp_byte_column_to_display_column (const char *data, int data_length, 1880- int column, int tabstop) 1881+ int column, 1882+ const cpp_char_column_policy &policy) 1883 { 1884 const int offset = MAX (0, column - data_length); 1885- cpp_display_width_computation dw (data, column - offset, tabstop); 1886+ cpp_display_width_computation dw (data, column - offset, policy); 1887 while (!dw.done ()) 1888- dw.process_next_codepoint (); 1889+ dw.process_next_codepoint (NULL); 1890 return dw.display_cols_processed () + offset; 1891 } 1892 1893 /* For the string of length DATA_LENGTH bytes that begins at DATA, compute 1894 the least number of bytes that will result in at least DISPLAY_COL display 1895 columns. The return value may exceed DATA_LENGTH if the entire string does 1896- not occupy enough display columns. */ 1897+ not occupy enough display columns. Non-printable-ASCII characters 1898+ will be escaped as per POLICY. */ 1899 1900 int 1901 cpp_display_column_to_byte_column (const char *data, int data_length, 1902- int display_col, int tabstop) 1903+ int display_col, 1904+ const cpp_char_column_policy &policy) 1905 { 1906- cpp_display_width_computation dw (data, data_length, tabstop); 1907+ cpp_display_width_computation dw (data, data_length, policy); 1908 const int avail_display = dw.advance_display_cols (display_col); 1909 return dw.bytes_processed () + MAX (0, display_col - avail_display); 1910 } 1911diff --git a/libcpp/errors.c b/libcpp/errors.c 1912--- a/libcpp/errors.c 2021-07-27 23:55:08.712307227 -0700 1913+++ b/libcpp/errors.c 2021-12-14 01:16:01.557942991 -0800 1914@@ -27,6 +27,31 @@ along with this program; see the file CO 1915 #include "cpplib.h" 1916 #include "internal.h" 1917 1918+/* Get a location_t for the current location in PFILE, 1919+ generally that of the previously lexed token. 
*/ 1920+ 1921+location_t 1922+cpp_diagnostic_get_current_location (cpp_reader *pfile) 1923+{ 1924+ if (CPP_OPTION (pfile, traditional)) 1925+ { 1926+ if (pfile->state.in_directive) 1927+ return pfile->directive_line; 1928+ else 1929+ return pfile->line_table->highest_line; 1930+ } 1931+ /* We don't want to refer to a token before the beginning of the 1932+ current run -- that is invalid. */ 1933+ else if (pfile->cur_token == pfile->cur_run->base) 1934+ { 1935+ return 0; 1936+ } 1937+ else 1938+ { 1939+ return pfile->cur_token[-1].src_loc; 1940+ } 1941+} 1942+ 1943 /* Print a diagnostic at the given location. */ 1944 1945 ATTRIBUTE_FPTR_PRINTF(5,0) 1946@@ -52,25 +77,7 @@ cpp_diagnostic (cpp_reader * pfile, enum 1947 enum cpp_warning_reason reason, 1948 const char *msgid, va_list *ap) 1949 { 1950- location_t src_loc; 1951- 1952- if (CPP_OPTION (pfile, traditional)) 1953- { 1954- if (pfile->state.in_directive) 1955- src_loc = pfile->directive_line; 1956- else 1957- src_loc = pfile->line_table->highest_line; 1958- } 1959- /* We don't want to refer to a token before the beginning of the 1960- current run -- that is invalid. */ 1961- else if (pfile->cur_token == pfile->cur_run->base) 1962- { 1963- src_loc = 0; 1964- } 1965- else 1966- { 1967- src_loc = pfile->cur_token[-1].src_loc; 1968- } 1969+ location_t src_loc = cpp_diagnostic_get_current_location (pfile); 1970 rich_location richloc (pfile->line_table, src_loc); 1971 return cpp_diagnostic_at (pfile, level, reason, &richloc, msgid, ap); 1972 } 1973@@ -142,6 +149,43 @@ cpp_warning_syshdr (cpp_reader * pfile, 1974 1975 va_end (ap); 1976 return ret; 1977+} 1978+ 1979+/* As cpp_warning above, but use RICHLOC as the location of the diagnostic. */ 1980+ 1981+bool cpp_warning_at (cpp_reader *pfile, enum cpp_warning_reason reason, 1982+ rich_location *richloc, const char *msgid, ...) 
1983+{ 1984+ va_list ap; 1985+ bool ret; 1986+ 1987+ va_start (ap, msgid); 1988+ 1989+ ret = cpp_diagnostic_at (pfile, CPP_DL_WARNING, reason, richloc, 1990+ msgid, &ap); 1991+ 1992+ va_end (ap); 1993+ return ret; 1994+ 1995+} 1996+ 1997+/* As cpp_pedwarning above, but use RICHLOC as the location of the 1998+ diagnostic. */ 1999+ 2000+bool 2001+cpp_pedwarning_at (cpp_reader * pfile, enum cpp_warning_reason reason, 2002+ rich_location *richloc, const char *msgid, ...) 2003+{ 2004+ va_list ap; 2005+ bool ret; 2006+ 2007+ va_start (ap, msgid); 2008+ 2009+ ret = cpp_diagnostic_at (pfile, CPP_DL_PEDWARN, reason, richloc, 2010+ msgid, &ap); 2011+ 2012+ va_end (ap); 2013+ return ret; 2014 } 2015 2016 /* Print a diagnostic at a specific location. */ 2017diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h 2018--- a/libcpp/include/cpplib.h 2021-12-13 23:23:05.768437079 -0800 2019+++ b/libcpp/include/cpplib.h 2021-12-14 01:20:16.189507386 -0800 2020@@ -1275,6 +1275,14 @@ extern bool cpp_warning_syshdr (cpp_read 2021 const char *msgid, ...) 2022 ATTRIBUTE_PRINTF_3; 2023 2024+/* As their counterparts above, but use RICHLOC. */ 2025+extern bool cpp_warning_at (cpp_reader *, enum cpp_warning_reason, 2026+ rich_location *richloc, const char *msgid, ...) 2027+ ATTRIBUTE_PRINTF_4; 2028+extern bool cpp_pedwarning_at (cpp_reader *, enum cpp_warning_reason, 2029+ rich_location *richloc, const char *msgid, ...) 2030+ ATTRIBUTE_PRINTF_4; 2031+ 2032 /* Output a diagnostic with "MSGID: " preceding the 2033 error string of errno. No location is printed. */ 2034 extern bool cpp_errno (cpp_reader *, enum cpp_diagnostic_level, 2035@@ -1435,42 +1443,95 @@ extern const char * cpp_get_userdef_suff 2036 2037 /* In charset.c */ 2038 2039+/* The result of attempting to decode a run of UTF-8 bytes. 
*/ 2040+ 2041+struct cpp_decoded_char 2042+{ 2043+ const char *m_start_byte; 2044+ const char *m_next_byte; 2045+ 2046+ bool m_valid_ch; 2047+ cppchar_t m_ch; 2048+}; 2049+ 2050+/* Information for mapping between code points and display columns. 2051+ 2052+ This is a tabstop value, along with a callback for getting the 2053+ widths of characters. Normally this callback is cpp_wcwidth, but we 2054+ support other schemes for escaping non-ASCII unicode as a series of 2055+ ASCII chars when printing the user's source code in diagnostic-show-locus.c 2056+ 2057+ For example, consider: 2058+ - the Unicode character U+03C0 "GREEK SMALL LETTER PI" (UTF-8: 0xCF 0x80) 2059+ - the Unicode character U+1F642 "SLIGHTLY SMILING FACE" 2060+ (UTF-8: 0xF0 0x9F 0x99 0x82) 2061+ - the byte 0xBF (a stray trailing byte of a UTF-8 character) 2062+ Normally U+03C0 would occupy one display column, U+1F642 2063+ would occupy two display columns, and the stray byte would be 2064+ printed verbatim as one display column. 2065+ 2066+ However when escaping them as unicode code points as "<U+03C0>" 2067+ and "<U+1F642>" they occupy 8 and 9 display columns respectively, 2068+ and when escaping them as bytes as "<CF><80>" and "<F0><9F><99><82>" 2069+ they occupy 8 and 16 display columns respectively. In both cases 2070+ the stray byte is escaped to <BF> as 4 display columns. */ 2071+ 2072+struct cpp_char_column_policy 2073+{ 2074+ cpp_char_column_policy (int tabstop, 2075+ int (*width_cb) (cppchar_t c)) 2076+ : m_tabstop (tabstop), 2077+ m_undecoded_byte_width (1), 2078+ m_width_cb (width_cb) 2079+ {} 2080+ 2081+ int m_tabstop; 2082+ /* Width in display columns of a stray byte that isn't decodable 2083+ as UTF-8. */ 2084+ int m_undecoded_byte_width; 2085+ int (*m_width_cb) (cppchar_t c); 2086+}; 2087+ 2088 /* A class to manage the state while converting a UTF-8 sequence to cppchar_t 2089 and computing the display width one character at a time. 
*/ 2090 class cpp_display_width_computation { 2091 public: 2092 cpp_display_width_computation (const char *data, int data_length, 2093- int tabstop); 2094+ const cpp_char_column_policy &policy); 2095 const char *next_byte () const { return m_next; } 2096 int bytes_processed () const { return m_next - m_begin; } 2097 int bytes_left () const { return m_bytes_left; } 2098 bool done () const { return !bytes_left (); } 2099 int display_cols_processed () const { return m_display_cols; } 2100 2101- int process_next_codepoint (); 2102+ int process_next_codepoint (cpp_decoded_char *out); 2103 int advance_display_cols (int n); 2104 2105 private: 2106 const char *const m_begin; 2107 const char *m_next; 2108 size_t m_bytes_left; 2109- const int m_tabstop; 2110+ const cpp_char_column_policy &m_policy; 2111 int m_display_cols; 2112 }; 2113 2114 /* Convenience functions that are simple use cases for class 2115 cpp_display_width_computation. Tab characters will be expanded to spaces 2116- as determined by TABSTOP. */ 2117+ as determined by POLICY.m_tabstop, and non-printable-ASCII characters 2118+ will be escaped as per POLICY. */ 2119+ 2120 int cpp_byte_column_to_display_column (const char *data, int data_length, 2121- int column, int tabstop); 2122+ int column, 2123+ const cpp_char_column_policy &policy); 2124 inline int cpp_display_width (const char *data, int data_length, 2125- int tabstop) 2126+ const cpp_char_column_policy &policy) 2127 { 2128 return cpp_byte_column_to_display_column (data, data_length, data_length, 2129- tabstop); 2130+ policy); 2131 } 2132 int cpp_display_column_to_byte_column (const char *data, int data_length, 2133- int display_col, int tabstop); 2134+ int display_col, 2135+ const cpp_char_column_policy &policy); 2136 int cpp_wcwidth (cppchar_t c); 2137 2138 #endif /* ! 
LIBCPP_CPPLIB_H */ 2139diff --git a/libcpp/include/line-map.h b/libcpp/include/line-map.h 2140--- a/libcpp/include/line-map.h 2021-07-27 23:55:08.716307283 -0700 2141+++ b/libcpp/include/line-map.h 2021-12-14 01:16:01.557942991 -0800 2142@@ -1781,6 +1781,18 @@ class rich_location 2143 const diagnostic_path *get_path () const { return m_path; } 2144 void set_path (const diagnostic_path *path) { m_path = path; } 2145 2146+ /* A flag for hinting that the diagnostic involves character encoding 2147+ issues, and thus that it will be helpful to the user if we show some 2148+ representation of how the characters in the pertinent source lines 2149+ are encoded. 2150+ The default is false (i.e. do not escape). 2151+ When set to true, non-ASCII bytes in the pertinent source lines will 2152+ be escaped in a manner controlled by the user-supplied option 2153+ -fdiagnostics-escape-format=, so that the user can better understand 2154+ what's going on with the encoding in their source file. */ 2155+ bool escape_on_output_p () const { return m_escape_on_output; } 2156+ void set_escape_on_output (bool flag) { m_escape_on_output = flag; } 2157+ 2158 private: 2159 bool reject_impossible_fixit (location_t where); 2160 void stop_supporting_fixits (); 2161@@ -1807,6 +1819,7 @@ protected: 2162 bool m_fixits_cannot_be_auto_applied; 2163 2164 const diagnostic_path *m_path; 2165+ bool m_escape_on_output; 2166 }; 2167 2168 /* A struct for the result of range_label::get_text: a NUL-terminated buffer 2169diff --git a/libcpp/internal.h b/libcpp/internal.h 2170--- a/libcpp/internal.h 2021-12-13 23:23:05.768437079 -0800 2171+++ b/libcpp/internal.h 2021-12-14 01:16:01.557942991 -0800 2172@@ -776,6 +776,9 @@ extern void _cpp_do_file_change (cpp_rea 2173 extern void _cpp_pop_buffer (cpp_reader *); 2174 extern char *_cpp_bracket_include (cpp_reader *); 2175 2176+/* In errors.c */ 2177+extern location_t cpp_diagnostic_get_current_location (cpp_reader *); 2178+ 2179 /* In traditional.c. 
*/ 2180 extern bool _cpp_scan_out_logical_line (cpp_reader *, cpp_macro *, bool); 2181 extern bool _cpp_read_logical_line_trad (cpp_reader *); 2182@@ -942,6 +945,26 @@ int linemap_get_expansion_line (class li 2183 const char* linemap_get_expansion_filename (class line_maps *, 2184 location_t); 2185 2186+/* A subclass of rich_location for emitting a diagnostic 2187+ at the current location of the reader, but flagging 2188+ it with set_escape_on_output (true). */ 2189+class encoding_rich_location : public rich_location 2190+{ 2191+ public: 2192+ encoding_rich_location (cpp_reader *pfile) 2193+ : rich_location (pfile->line_table, 2194+ cpp_diagnostic_get_current_location (pfile)) 2195+ { 2196+ set_escape_on_output (true); 2197+ } 2198+ 2199+ encoding_rich_location (cpp_reader *pfile, location_t loc) 2200+ : rich_location (pfile->line_table, loc) 2201+ { 2202+ set_escape_on_output (true); 2203+ } 2204+}; 2205+ 2206 #ifdef __cplusplus 2207 } 2208 #endif 2209diff --git a/libcpp/lex.c b/libcpp/lex.c 2210--- a/libcpp/lex.c 2021-12-14 01:14:48.435225968 -0800 2211+++ b/libcpp/lex.c 2021-12-14 01:24:37.220995816 -0800 2212@@ -1774,7 +1774,11 @@ skip_whitespace (cpp_reader *pfile, cppc 2213 while (is_nvspace (c)); 2214 2215 if (saw_NUL) 2216- cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored"); 2217+ { 2218+ encoding_rich_location rich_loc (pfile); 2219+ cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc, 2220+ "null character(s) ignored"); 2221+ } 2222 2223 buffer->cur--; 2224 } 2225@@ -1803,6 +1807,28 @@ warn_about_normalization (cpp_reader *pf 2226 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s) 2227 && !pfile->state.skipping) 2228 { 2229+ location_t loc = token->src_loc; 2230+ 2231+ /* If possible, create a location range for the token. */ 2232+ if (loc >= RESERVED_LOCATION_COUNT 2233+ && token->type != CPP_EOF 2234+ /* There must be no line notes to process. 
*/ 2235+ && (!(pfile->buffer->cur 2236+ >= pfile->buffer->notes[pfile->buffer->cur_note].pos 2237+ && !pfile->overlaid_buffer))) 2238+ { 2239+ source_range tok_range; 2240+ tok_range.m_start = loc; 2241+ tok_range.m_finish 2242+ = linemap_position_for_column (pfile->line_table, 2243+ CPP_BUF_COLUMN (pfile->buffer, 2244+ pfile->buffer->cur)); 2245+ loc = COMBINE_LOCATION_DATA (pfile->line_table, 2246+ loc, tok_range, NULL); 2247+ } 2248+ 2249+ encoding_rich_location rich_loc (pfile, loc); 2250+ 2251 /* Make sure that the token is printed using UCNs, even 2252 if we'd otherwise happily print UTF-8. */ 2253 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token)); 2254@@ -1810,11 +1836,11 @@ warn_about_normalization (cpp_reader *pf 2255 2256 sz = cpp_spell_token (pfile, token, buf, false) - buf; 2257 if (NORMALIZE_STATE_RESULT (s) == normalized_C) 2258- cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, 2259- "`%.*s' is not in NFKC", (int) sz, buf); 2260+ cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc, 2261+ "`%.*s' is not in NFKC", (int) sz, buf); 2262 else 2263- cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, 2264- "`%.*s' is not in NFC", (int) sz, buf); 2265+ cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc, 2266+ "`%.*s' is not in NFC", (int) sz, buf); 2267 free (buf); 2268 } 2269 } 2270diff --git a/libcpp/line-map.c b/libcpp/line-map.c 2271--- a/libcpp/line-map.c 2021-07-27 23:55:08.716307283 -0700 2272+++ b/libcpp/line-map.c 2021-12-14 01:16:01.561942921 -0800 2273@@ -2086,7 +2086,8 @@ rich_location::rich_location (line_maps 2274 m_fixit_hints (), 2275 m_seen_impossible_fixit (false), 2276 m_fixits_cannot_be_auto_applied (false), 2277- m_path (NULL) 2278+ m_path (NULL), 2279+ m_escape_on_output (false) 2280 { 2281 add_range (loc, SHOW_RANGE_WITH_CARET, label); 2282 } 2283