libdap++  Updated for version 3.8.2
escaping.cc
Go to the documentation of this file.
1 
2 // -*- mode: c++; c-basic-offset:4 -*-
3 
4 // This file is part of libdap, A C++ implementation of the OPeNDAP Data
5 // Access Protocol.
6 
7 // Copyright (c) 2002,2003 OPeNDAP, Inc.
8 // Author: James Gallagher <jgallagher@opendap.org>
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 //
24 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
25 
26 // Copyright (c) 1996, California Institute of Technology.
27 // ALL RIGHTS RESERVED. U.S. Government Sponsorship acknowledged.
28 //
29 // Please read the full copyright notice in the file COPYRIGHT_URI
30 // in this directory.
31 //
32 // Author: Todd Karakashian, NASA/Jet Propulsion Laboratory
33 // Todd.K.Karakashian@jpl.nasa.gov
34 //
35 // $RCSfile: escaping.cc,v $ - Miscellaneous routines for OPeNDAP HDF server
36 //
37 // These two routines are for escaping/unescaping strings that are identifiers
38 // in DAP2
39 // id2www() -- escape (using WWW hex codes) non-allowable characters in a
40 // DAP2 identifier
41 // www2id() -- given a WWW hex-code-escaped identifier, restore it
42 //
43 // These two routines are for escaping/unescaping strings storing attribute
44 // values. They use traditional octal escapes (\nnn) because they are
45 // intended to be viewed by a user
46 // escattr() -- escape (using traditional octal backslash) non-allowable
47 // characters in the value of a DAP2 attribute
48 // unescattr() -- given an octally escaped string, restore it
49 //
50 // These are routines used by the above, not intended to be called directly:
51 //
52 // hexstring()
53 // unhexstring()
54 // octstring()
55 // unoctstring()
56 //
57 // -Todd
58 
59 #include <ctype.h>
60 
61 #include <iomanip>
62 #include <string>
63 #include <sstream>
64 
65 #include "GNURegex.h"
66 #include "Error.h"
67 #include "InternalErr.h"
68 //#define DODS_DEBUG
69 #include "debug.h"
70 
71 using namespace std;
72 
73 namespace libdap {
74 
75 // The next four functions were originally defined static, but I removed that
76 // to make testing them (see generalUtilTest.cc) easier to write. 5/7/2001
77 // jhrg
78 
// Render one byte as hexadecimal text, left-padded with '0' to a minimum
// width of two characters (for example 10 becomes "0a").
//
// val: the byte to format.
// Returns the formatted digits; no "0x" or '%' prefix is added here.
string
hexstring(unsigned char val)
{
    ostringstream digits;
    digits.setf(ios::hex, ios::basefield);
    digits.fill('0');
    // width() applies only to the next insertion, which is the one below.
    digits.width(2);
    // Widen first so the stream prints a number rather than a character.
    digits << static_cast<unsigned int>(val);

    return digits.str();
}
87 
// Convert a string of hexadecimal digits into the single character with
// that code.
//
// s: text holding the hex digits (www2id() passes the two characters that
//    follow an escape character).
// Returns a one-character string, or an empty string when the parsed value
// is zero -- which includes the case where nothing could be parsed at all.
string
unhexstring(string s)
{
    // Must start at zero: when s is empty the extraction below stores
    // nothing, and reading an uninitialized int would be undefined
    // behavior. www2id() reaches that case for input ending in the escape
    // character.
    int val = 0;
    istringstream ss(s);
    ss >> hex >> val;

    // Built through a C string so that a zero value yields "" rather than a
    // string holding one NUL character.
    char tmp_str[2];
    tmp_str[0] = static_cast<char>(val);
    tmp_str[1] = '\0';
    return string(tmp_str);
}
99 
// Render one byte as octal text, left-padded with '0' to a minimum width of
// three characters (for example 10 becomes "012").
//
// val: the byte to format.
// Returns the formatted digits; the leading backslash of an octal escape is
// added by the caller, not here.
string
octstring(unsigned char val)
{
    ostringstream digits;
    digits.setf(ios::oct, ios::basefield);
    digits.fill('0');
    // width() applies only to the next insertion, which is the one below.
    digits.width(3);
    // Widen first so the stream prints a number rather than a character.
    digits << static_cast<unsigned int>(val);

    return digits.str();
}
109 
110 string
111 unoctstring(string s)
112 {
113  int val;
114 
115  istringstream ss(s);
116  ss >> oct >> val;
117 
118  DBG(cerr << "unoctstring: " << val << endl);
119 
120  char tmp_str[2];
121  tmp_str[0] = static_cast<char>(val);
122  tmp_str[1] = '\0';
123  return string(tmp_str);
124 }
125 
150 string
151 id2www(string in, const string &allowable)
152 {
153  string::size_type i = 0;
154  DBG(cerr<<"Input string: [" << in << "]" << endl);
155  while ((i = in.find_first_not_of(allowable, i)) != string::npos) {
156  DBG(cerr<<"Found escapee: [" << in[i] << "]");
157  in.replace(i, 1, "%" + hexstring(in[i]));
158  DBGN(cerr<<" now the string is: " << in << endl);
159  i += 3;//i++;
160  }
161 
162  return in;
163 }
164 
175 string
176 id2www_ce(string in, const string &allowable)
177 {
178  return id2www(in, allowable);
179 }
180 
213 string
214 www2id(const string &in, const string &escape, const string &except)
215 {
216  string::size_type i = 0;
217  string res = in;
218  while ((i = res.find_first_of(escape, i)) != string::npos) {
219  if (except.find(res.substr(i, 3)) != string::npos) {
220  i += 3;
221  continue;
222  }
223  res.replace(i, 3, unhexstring(res.substr(i + 1, 2)));
224  ++i;
225  }
226 
227  return res;
228 }
229 
230 static string
231 entity(char c)
232 {
233  switch (c) {
234  case '>': return "&gt;";
235  case '<': return "&lt;";
236  case '&': return "&amp;";
237  case '\'': return "&apos;";
238  case '\"': return "&quot;";
239  default:
240  throw InternalErr(__FILE__, __LINE__, "Unrecognized character.");
241  }
242 }
243 
// Re-express a number written in octal digits as hexadecimal digits.
//
// octal_digits: text parsed as an octal integer. The only caller visible in
//               this file (disabled code in id2xml()) passes up to three
//               digits.
// Returns the value in hexadecimal, left-padded with '0' to a minimum width
// of two characters. Input that cannot be parsed yields "00".
string
octal_to_hex(const string &octal_digits)
{
    // Must start at zero: for empty input the extraction below stores
    // nothing, and formatting an uninitialized int would be undefined
    // behavior.
    int val = 0;

    istringstream ss(octal_digits);
    ss >> oct >> val;

    ostringstream ds;
    ds << hex << setw(2) << setfill('0') << val;
    return ds.str();
}
258 
265 string
266 id2xml(string in, const string &not_allowed)
267 {
268  string::size_type i = 0;
269 
270  while ((i = in.find_first_of(not_allowed, i)) != string::npos) {
271  in.replace(i, 1, entity(in[i]));
272  ++i;
273  }
274 #if 0
275  // Removed the encoding of octal escapes. This function is used by
276  // AttrTable to encode the stuff that is the value of the <value>
277  // element in the DDX. The problem is that some of the values are not
278  // valid UTF-8 and that makes a XML parser gag.; ticket 1512.
279  // jhrg 3/19/10
280 
281  // OK, now scan for octal escape sequences like \\012 (where the '\'
282  // is itself escaped). This type of attribute value comes from the netCDF
283  // handler and maybe others. Assumption: The '\' will always appear as
284  // in its escaped form: '\\'. NB: Both backslashes must be escaped in the
285  // C++ string.
286  string octal_escape = "\\\\";
287  i = 0;
288  string::size_type length = in.length();
289  while ((i = in.find(octal_escape, i)) != string::npos) {
290  // Get the three octal digits following the '\\0'
291  string::size_type j = i + 2;
292  if (j + 1 >= length) // Check that we're not past the end
293  break;
294  string octal_digits = in.substr(j, 3);
295  // convert to a &#xdd; XML escape
296  string hex_escape = string("&#x");
297  hex_escape.append(octal_to_hex(octal_digits));
298  hex_escape.append(string(";"));
299 
300  // replace the octal escape with an XML/hex escape
301  in.replace(i, 5, hex_escape);
302 
303  // increment i
304  i += 6;
305  }
306 #endif
307  return in;
308 }
309 
// Decode the five XML entity references &gt; &lt; &amp; &apos; &quot; back
// to the characters > < & ' ".
//
// The text is decoded in a single left-to-right pass and the output of a
// replacement is never scanned again. Decoding one entity at a time over
// the whole string would decode twice: "&amp;quot;" would first become
// "&quot;" and then '"', so text containing a literal "&quot;" could not
// survive an encode/decode round trip.
//
// in: the text to decode (taken by value and edited in place).
// Returns the decoded text; any '&' that does not start one of the five
// references is left unchanged.
string
xml2id(string in)
{
    static const struct {
        const char *text;          // the entity reference
        string::size_type length;  // number of characters in `text`
        char value;                // the character it stands for
    } entities[] = {
        { "&gt;", 4, '>' },
        { "&lt;", 4, '<' },
        { "&amp;", 5, '&' },
        { "&apos;", 6, '\'' },
        { "&quot;", 6, '"' }
    };
    const unsigned int entity_count = sizeof(entities) / sizeof(entities[0]);

    string::size_type i = 0;
    while ((i = in.find('&', i)) != string::npos) {
        for (unsigned int k = 0; k < entity_count; ++k) {
            if (in.compare(i, entities[k].length, entities[k].text) == 0) {
                in.replace(i, entities[k].length, 1, entities[k].value);
                break;
            }
        }
        // Move past either the decoded character or an unrelated '&'.
        ++i;
    }

    return in;
}
341 
// Replace each '%' together with the (up to) two characters that follow it
// by a single underscore.
//
// s: the text to rewrite (taken by value and edited in place).
// Returns the rewritten text.
string
esc2underscore(string s)
{
    string::size_type at = s.find('%');
    while (at != string::npos) {
        // replace() clamps the count when fewer than three characters remain.
        s.replace(at, 3, "_");
        // Nothing before `at` holds a '%', and the character now at `at` is
        // the underscore, so the search may resume just after it.
        at = s.find('%', at + 1);
    }

    return s;
}
356 
357 
361 string
362 escattr(string s)
363 {
364  const string printable = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789~`!@#$%^&*()_-+={[}]|\\:;<,>.?/'\"";
365  const string ESC = "\\";
366  const string DOUBLE_ESC = ESC + ESC;
367  const string QUOTE = "\"";
368  const string ESCQUOTE = ESC + QUOTE;
369 
370  // escape non-printing characters with octal escape
371  string::size_type ind = 0;
372  while ((ind = s.find_first_not_of(printable, ind)) != s.npos)
373  s.replace(ind, 1, ESC + octstring(s[ind]));
374 
375  // escape \ with a second backslash
376  ind = 0;
377  while ((ind = s.find(ESC, ind)) != s.npos) {
378  s.replace(ind, 1, DOUBLE_ESC);
379  ind += DOUBLE_ESC.length();
380  }
381 
382  // escape " with backslash
383  ind = 0;
384  while ((ind = s.find(QUOTE, ind)) != s.npos) {
385  s.replace(ind, 1, ESCQUOTE);
386  ind += ESCQUOTE.length();
387  }
388 
389  return s;
390 }
391 
400 string
401 unescattr(string s)
402 {
403  Regex octal("\\\\[0-3][0-7][0-7]"); // matches 4 characters
404  Regex esc_quote("\\\\\""); // matches 3 characters
405  Regex esc_esc("\\\\\\\\"); // matches 2 characters
406  const string ESC = "\\";
407  const string QUOTE = "\"";
408  int matchlen;
409  unsigned int index;
410 
411  DBG(cerr << "0XX" << s << "XXX" << endl);
412  // unescape any escaped backslashes
413  index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
414  while (index < s.length()) {
415  DBG(cerr << "1aXX" << s << "XXX index: " << index << endl);
416  s.replace(index, 2, ESC);
417  DBG(cerr << "1bXX" << s << "XXX index: " << index << endl);
418  index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
419  }
420 
421  // unescape any escaped double quote characters
422  index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
423  while (index < s.length()) {
424  s.replace(index, 2, QUOTE);
425  DBG(cerr << "2XX" << s << "XXX index: " << index << endl);
426  index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
427  }
428 
429  // unescape octal characters
430  index = octal.search(s.c_str(), s.length(), matchlen, 0);
431  while (index < s.length()) {
432  s.replace(index, 4, unoctstring(s.substr(index + 1, 3)));
433  DBG(cerr << "3XX" << s << "XXX index: " << index << endl);
434  index = octal.search(s.c_str(), s.length(), matchlen, 0);
435  }
436 
437  DBG(cerr << "4XX" << s << "XXX" << endl);
438  return s;
439 }
440 
// Wrap a message in double quotes and backslash-escape any unescaped double
// quote inside it.
//
// msg: the message text (taken by value and edited in place). It may be
//      empty, and may already carry one or both enclosing quotes.
// Returns the quoted message; the result always has at least two characters
// and begins and ends with a double quote.
string
munge_error_message(string msg)
{
    // First, add enclosing quotes if needed. The emptiness and length checks
    // keep an empty message, or one that is just a single '"', from being
    // indexed out of range; both become "\"\"".
    if (msg.empty() || msg[0] != '"')
        msg.insert(msg.begin(), '"');
    if (msg.size() < 2 || msg[msg.size() - 1] != '"')
        msg += "\"";

    // Now escape any internal double quotes that aren't escaped. The first
    // and last characters are the enclosing quotes and are not examined.
    for (string::size_type i = 1; i + 1 < msg.size(); ++i) {
        if (msg[i] == '"' && msg[i - 1] != '\\') {
            msg.insert(i, 1, '\\');
            ++i; // step over the quote that was just escaped
        }
    }

    return msg;
}
458 
// Put a backslash in front of every double quote in the text.
//
// source: the text to escape (taken by value and edited in place).
// Returns the escaped text. Quotes are escaped unconditionally, whether or
// not a backslash already precedes them.
string
escape_double_quotes(string source)
{
    string::size_type at = source.find('\"', 0);
    while (at != string::npos) {
        source.insert(at, 1, '\\');
        // Skip the backslash just inserted and the quote that follows it.
        at = source.find('\"', at + 2);
    }

    return source;
}
474 
// Reverse escape_double_quotes(): every backslash that is immediately
// followed by a double quote is removed, leaving the quote in place.
//
// source: the text to unescape (taken by value and edited in place).
// Returns the unescaped text.
string
unescape_double_quotes(string source)
{
    string::size_type idx = 0;
    while ((idx = source.find("\\\"", idx)) != string::npos) {
        source.replace(idx, 2, "\""); // the two-character escape becomes a lone double quote
        ++idx; // continue after the quote that was kept
    }

    return source;
}
491 
492 } // namespace libdap
493 
string id2www_ce(string in, const string &allowable)
Definition: escaping.cc:176
#define DBGN(x)
Definition: debug.h:59
string id2xml(string in, const string &not_allowed)
Definition: escaping.cc:266
string escape_double_quotes(string source)
Definition: escaping.cc:464
string octal_to_hex(const string &octal_digits)
Definition: escaping.cc:247
A class for software fault reporting.
Definition: InternalErr.h:64
string unoctstring(string s)
Definition: escaping.cc:111
#define DBG(x)
Definition: debug.h:58
string munge_error_message(string msg)
Definition: escaping.cc:442
string xml2id(string in)
Definition: escaping.cc:316
string www2id(const string &in, const string &escape, const string &except)
Definition: escaping.cc:214
string esc2underscore(string s)
Definition: escaping.cc:348
string unhexstring(string s)
Definition: escaping.cc:89
string hexstring(unsigned char val)
Definition: escaping.cc:80
string octstring(unsigned char val)
Definition: escaping.cc:101
string unescattr(string s)
Definition: escaping.cc:401
string id2www(string in, const string &allowable)
Definition: escaping.cc:151
string unescape_double_quotes(string source)
Definition: escaping.cc:481
string escattr(string s)
Definition: escaping.cc:362