Files

Addressable::IDNA

A pure Ruby implementation of IDNA. C was eschewed for the sake of JRuby, and because performance is largely irrelevant here.

Constants

HANGUL_SBASE
HANGUL_LBASE
HANGUL_LCOUNT
HANGUL_VBASE
HANGUL_VCOUNT
HANGUL_TBASE
HANGUL_TCOUNT
HANGUL_NCOUNT
HANGUL_SCOUNT
UNICODE_DATA_COMBINING_CLASS
UNICODE_DATA_EXCLUSION
UNICODE_DATA_CANONICAL
UNICODE_DATA_COMPATIBILITY
UNICODE_DATA_UPPERCASE
UNICODE_DATA_LOWERCASE
UNICODE_DATA_TITLECASE
UNICODE_DATA

This is a sparse Unicode table. Codepoints without entries are assumed to have the value: [0, 0, nil, nil, nil, nil, nil]

COMPOSITION_TABLE
UNICODE_MAX_LENGTH
ACE_MAX_LENGTH
PUNYCODE_BASE
PUNYCODE_TMIN
PUNYCODE_TMAX
PUNYCODE_SKEW
PUNYCODE_DAMP
PUNYCODE_INITIAL_BIAS
PUNYCODE_INITIAL_N
PUNYCODE_DELIMITER
PUNYCODE_MAXINT
PUNYCODE_PRINT_ASCII

Public Class Methods

to_ascii(input) click to toggle source

Converts from a Unicode internationalized domain name to an ASCII domain name as described in RFC 3490.

    # File lib/addressable/idna.rb, line 47
# Converts a Unicode internationalized domain name into its ASCII form.
#
# The input is duplicated and treated as raw bytes. If it does not match
# both UTF8_REGEX and UTF8_REGEX_MULTIBYTE, the byte-encoded copy is
# returned untouched. Otherwise the name is downcased, split on ".", and
# every label that matches both patterns is NFKC-normalized,
# Punycode-encoded and prefixed with ACE_PREFIX.
#
# @param [String] input The domain name to convert.
# @return [String] The converted domain name.
def self.to_ascii(input)
  binary = input.dup
  binary.force_encoding(Encoding::ASCII_8BIT) if binary.respond_to?(:force_encoding)
  return binary unless binary =~ UTF8_REGEX && binary =~ UTF8_REGEX_MULTIBYTE

  labels = unicode_downcase(binary).split('.').map do |label|
    label.force_encoding(Encoding::ASCII_8BIT) if label.respond_to?(:force_encoding)
    if label =~ UTF8_REGEX && label =~ UTF8_REGEX_MULTIBYTE
      ACE_PREFIX + punycode_encode(unicode_normalize_kc(label))
    else
      label
    end
  end
  labels.join('.')
end
to_unicode(input) click to toggle source

Converts from an ASCII domain name to a Unicode internationalized domain name as described in RFC 3490.

    # File lib/addressable/idna.rb, line 72
# Converts an ASCII domain name into its Unicode internationalized form.
#
# The name is split on "."; every label that starts with ACE_PREFIX and
# carries a payload after it is Punycode-decoded, all other labels are
# kept verbatim. The joined result is tagged as UTF-8 where the runtime
# supports string encodings.
#
# A label consisting of the bare ACE prefix has nothing to decode and is
# left unchanged (previously it passed nil to punycode_decode and
# crashed). The prefix is matched at the start of the label only, never
# at the start of an embedded line.
#
# @param [String] input The ASCII domain name.
# @return [String] The Unicode domain name.
def self.to_unicode(input)
  labels = input.split('.').map do |label|
    if label.start_with?(ACE_PREFIX) && label.length > ACE_PREFIX.length
      punycode_decode(label[ACE_PREFIX.length..-1])
    else
      label
    end
  end
  output = labels.join('.')
  output.force_encoding(Encoding::UTF_8) if output.respond_to?(:force_encoding)
  output
end
unicode_normalize_kc(input) click to toggle source

Normalizes the input string to Unicode Normalization Form KC (NFKC).

    # File lib/addressable/idna.rb, line 89
# Applies Unicode normalization form KC to a string.
#
# The string is unpacked into codepoints, run through decomposition,
# canonical ordering and composition, and packed back into a string.
#
# @param [String] input The string to normalize.
# @return [String] The normalized string.
def self.unicode_normalize_kc(input)
  decomposed = unicode_decompose(input.unpack("U*"))
  ordered = unicode_sort_canonical(decomposed)
  unicode_compose(ordered).pack("U*")
end

Private Class Methods

lookup_unicode_combining_class(codepoint) click to toggle source
     # File lib/addressable/idna.rb, line 255
# Looks up the combining class recorded for a codepoint in UNICODE_DATA.
#
# @param [Integer] codepoint The codepoint to look up.
# @return The stored combining class, or 0 when the codepoint has no
#   entry or the entry has no combining class.
def self.lookup_unicode_combining_class(codepoint)
  entry = UNICODE_DATA[codepoint]
  return 0 unless entry

  entry[UNICODE_DATA_COMBINING_CLASS] || 0
end
lookup_unicode_compatibility(codepoint) click to toggle source
     # File lib/addressable/idna.rb, line 263
# Looks up the compatibility field recorded for a codepoint in
# UNICODE_DATA.
#
# @param [Integer] codepoint The codepoint to look up.
# @return The stored compatibility value, or nil when the codepoint has
#   no entry.
def self.lookup_unicode_compatibility(codepoint)
  entry = UNICODE_DATA[codepoint]
  return nil unless entry

  entry[UNICODE_DATA_COMPATIBILITY]
end
lookup_unicode_composition(unpacked) click to toggle source
     # File lib/addressable/idna.rb, line 278
# Looks up a composition in COMPOSITION_TABLE.
#
# @param [Array<Integer>] unpacked Byte values; they are packed into a
#   string ("C*") which is used as the table key.
# @return The table entry for that key, or whatever the table yields for
#   a missing key.
def self.lookup_unicode_composition(unpacked)
  table_key = unpacked.pack("C*")
  COMPOSITION_TABLE[table_key]
end
lookup_unicode_lowercase(codepoint) click to toggle source
     # File lib/addressable/idna.rb, line 270
# Looks up the lowercase mapping recorded for a codepoint in
# UNICODE_DATA.
#
# @param [Integer] codepoint The codepoint to look up.
# @return The stored lowercase value, or the codepoint itself when there
#   is no entry or no lowercase mapping.
def self.lookup_unicode_lowercase(codepoint)
  entry = UNICODE_DATA[codepoint]
  return codepoint unless entry

  entry[UNICODE_DATA_LOWERCASE] || codepoint
end
punycode_adapt(delta, numpoints, firsttime) click to toggle source

Bias adaptation method

      # File lib/addressable/idna.rb, line 4854
# Punycode bias adaptation.
#
# Scales delta (dividing by PUNYCODE_DAMP on the first call, halving it
# otherwise), adds delta / numpoints, then repeatedly divides by
# (PUNYCODE_BASE - PUNYCODE_TMIN) while the value stays above the
# threshold, adding PUNYCODE_BASE to the result for each division.
#
# @param [Integer] delta The delta that was just encoded or decoded.
# @param [Integer] numpoints The number of code points handled so far.
# @param firsttime Truthy when this is the first adaptation.
# @return [Integer] The new bias.
def self.punycode_adapt(delta, numpoints, firsttime)
  span = PUNYCODE_BASE - PUNYCODE_TMIN
  ceiling = (span * PUNYCODE_TMAX) / 2

  scaled = firsttime ? delta / PUNYCODE_DAMP : delta >> 1
  scaled += scaled / numpoints

  offset = 0
  until scaled <= ceiling
    scaled /= span
    offset += PUNYCODE_BASE
  end

  offset + (span + 1) * scaled / (scaled + PUNYCODE_SKEW)
end
punycode_basic?(codepoint) click to toggle source
      # File lib/addressable/idna.rb, line 4822
# Tests whether a codepoint is a basic code point, i.e. below 0x80.
#
# @param [Integer] codepoint The codepoint to test.
# @return [Boolean] true when the codepoint is less than 128.
def self.punycode_basic?(codepoint)
  codepoint < 128
end
punycode_decode(punycode) click to toggle source
      # File lib/addressable/idna.rb, line 4700
# Decodes a Punycode string into a string of Unicode code points.
#
# The argument is the Punycode payload only; to_unicode strips the ACE
# prefix before calling this method.
#
# Steps, as implemented below:
#   1. Reject input whose size exceeds ACE_MAX_LENGTH * 2 and input that
#      contains a byte outside 0..127.
#   2. Find the index of the last delimiter (b) and copy the b code points
#      before it straight to the output. When there is no delimiter, or
#      the only delimiter sits at index 0, b stays 0 and decoding starts
#      at the first byte.
#   3. For the rest of the input, decode one generalized variable-length
#      integer at a time, re-adapt the bias with punycode_adapt, derive
#      the next code point n and insertion index i from it, and insert n
#      into the output at position i.
#
# output_length is a one-element array that is only read for max_out and
# written at the end; its final value is not returned (presumably a
# holdover from an in/out parameter in a C implementation).
#
# @param [String] punycode The Punycode payload to decode.
# @return [String] The decoded code points packed with "U*".
# @raise [PunycodeBigOutput] If the input is too long or the output would
#   exceed UNICODE_MAX_LENGTH code points.
# @raise [PunycodeBadInput] If the input contains a non-ASCII byte, an
#   invalid digit, or ends in the middle of an integer.
# @raise [PunycodeOverflow] If an intermediate value would exceed
#   PUNYCODE_MAXINT.
4700:     def self.punycode_decode(punycode)
4701:       input = []
4702:       output = []
4703: 
4704:       if ACE_MAX_LENGTH * 2 < punycode.size
4705:         raise PunycodeBigOutput, "Output would exceed the space provided."
4706:       end
4707:       punycode.each_byte do |c|
4708:         unless c >= 0 && c <= 127
4709:           raise PunycodeBadInput, "Input is invalid."
4710:         end
4711:         input.push(c)
4712:       end
4713: 
4714:       input_length = input.length
4715:       output_length = [UNICODE_MAX_LENGTH]
4716: 
4717:       # Initialize the state
4718:       n = PUNYCODE_INITIAL_N
4719: 
4720:       out = i = 0
4721:       max_out = output_length[0]
4722:       bias = PUNYCODE_INITIAL_BIAS
4723: 
4724:       # Handle the basic code points:  Let b be the number of input code
4725:       # points before the last delimiter, or 0 if there is none, then
4726:       # copy the first b code points to the output.
4727: 
4728:       b = 0
4729:       input_length.times do |j|
4730:         b = j if punycode_delimiter?(input[j])
4731:       end
4732:       if b > max_out
4733:         raise PunycodeBigOutput, "Output would exceed the space provided."
4734:       end
4735: 
4736:       b.times do |j|
4737:         unless punycode_basic?(input[j])
4738:           raise PunycodeBadInput, "Input is invalid."
4739:         end
4740:         output[out] = input[j]
4741:         out+=1
4742:       end
4743: 
4744:       # Main decoding loop:  Start just after the last delimiter if any
4745:       # basic code points were copied; start at the beginning otherwise.
4746: 
4747:       in_ = b > 0 ? b + 1 : 0
4748:       while in_ < input_length
4749: 
4750:         # in_ is the index of the next character to be consumed, and
4751:         # out is the number of code points in the output array.
4752: 
4753:         # Decode a generalized variable-length integer into delta,
4754:         # which gets added to i.  The overflow checking is easier
4755:         # if we increase i as we go, then subtract off its starting
4756:         # value at the end to obtain delta.
4757: 
4758:         oldi = i; w = 1; k = PUNYCODE_BASE
4759:         while true
4760:           if in_ >= input_length
4761:             raise PunycodeBadInput, "Input is invalid."
4762:           end
4763:           digit = punycode_decode_digit(input[in_])
4764:           in_+=1
4765:           if digit >= PUNYCODE_BASE
4766:             raise PunycodeBadInput, "Input is invalid."
4767:           end
4768:           if digit > (PUNYCODE_MAXINT - i) / w
4769:             raise PunycodeOverflow, "Input needs wider integers to process."
4770:           end
4771:           i += digit * w
4772:           t = (
4773:             if k <= bias
4774:               PUNYCODE_TMIN
4775:             elsif k >= bias + PUNYCODE_TMAX
4776:               PUNYCODE_TMAX
4777:             else
4778:               k - bias
4779:             end
4780:           )
4781:           break if digit < t
4782:           if w > PUNYCODE_MAXINT / (PUNYCODE_BASE - t)
4783:             raise PunycodeOverflow, "Input needs wider integers to process."
4784:           end
4785:           w *= PUNYCODE_BASE - t
4786:           k += PUNYCODE_BASE
4787:         end
4788: 
4789:         bias = punycode_adapt(i - oldi, out + 1, oldi == 0)
4790: 
4791:         # I was supposed to wrap around from out + 1 to 0,
4792:         # incrementing n each time, so we'll fix that now:
4793: 
4794:         if i / (out + 1) > PUNYCODE_MAXINT - n
4795:           raise PunycodeOverflow, "Input needs wider integers to process."
4796:         end
4797:         n += i / (out + 1)
4798:         i = out + 1
4799: 
4800:         # Insert n at position i of the output:
4801: 
4802:         # not needed for Punycode:
4803:         # raise PUNYCODE_INVALID_INPUT if decode_digit(n) <= base
4804:         if out >= max_out
4805:           raise PunycodeBigOutput, "Output would exceed the space provided."
4806:         end
4807: 
4808:         #memmove(output + i + 1, output + i, (out - i) * sizeof *output)
4809:         output[i + 1, out - i] = output[i, out - i]
4810:         output[i] = n
4811:         i += 1
4812: 
4813:         out += 1
4814:       end
4815: 
4816:       output_length[0] = out
4817: 
4818:       output.pack("U*")
4819:     end
punycode_decode_digit(codepoint) click to toggle source

Returns the numeric value of a basic codepoint (for use in representing integers) in the range 0 to base - 1, or PUNYCODE_BASE if codepoint does not represent a value.

      # File lib/addressable/idna.rb, line 4840
# Returns the numeric value of a basic codepoint used as a Punycode
# digit, or PUNYCODE_BASE when the codepoint is not a digit.
#
# "a".."z" and "A".."Z" map to 0..25 and "0".."9" map to 26..35.
#
# The bounds are checked on both sides. A one-sided test such as
# `codepoint - 48 < 10` only works with unsigned arithmetic; with Ruby's
# signed integers it also accepts every codepoint below "0" (so "-"
# decoded as 23) and produced negative digits for ":".."@" and "`".
#
# @param [Integer] codepoint The codepoint to interpret.
# @return [Integer] A value in 0...PUNYCODE_BASE, or PUNYCODE_BASE if the
#   codepoint does not represent a digit.
def self.punycode_decode_digit(codepoint)
  if codepoint >= 48 && codepoint <= 57
    # "0".."9" => 26..35
    codepoint - 22
  elsif codepoint >= 65 && codepoint <= 90
    # "A".."Z" => 0..25
    codepoint - 65
  elsif codepoint >= 97 && codepoint <= 122
    # "a".."z" => 0..25
    codepoint - 97
  else
    PUNYCODE_BASE
  end
end
punycode_delimiter?(codepoint) click to toggle source
      # File lib/addressable/idna.rb, line 4827
# Tests whether a codepoint is the Punycode delimiter.
#
# @param [Integer] codepoint The codepoint to test.
# @return [Boolean] true when the codepoint equals PUNYCODE_DELIMITER.
def self.punycode_delimiter?(codepoint)
  is_delimiter = (codepoint == PUNYCODE_DELIMITER)
  is_delimiter
end
punycode_encode(unicode) click to toggle source
      # File lib/addressable/idna.rb, line 4580
# Encodes a string of Unicode code points as Punycode.
#
# The result is the Punycode payload only; this method does not add the
# ACE prefix (to_ascii prepends it).
#
# Steps, as implemented below:
#   1. Unpack the input with "U*" and copy every basic code point to the
#      output, followed by PUNYCODE_DELIMITER if at least one was copied.
#   2. Repeatedly pick the smallest not-yet-handled code point m >= n,
#      advance delta accordingly, and for every input position holding
#      that code point emit delta as a generalized variable-length
#      integer, then re-adapt the bias with punycode_adapt.
#   3. Verify every emitted value is within 0..127 and is flagged in
#      PUNYCODE_PRINT_ASCII, then convert the values to characters.
#
# The output buffer is pre-filled with zeros; the final expression takes
# one element more than was written and strips trailing NUL characters.
#
# NOTE(review): the "Invalid output char." branch raises the bare
# Exception class, which a plain `rescue` does not catch.
#
# @param [String] unicode The string to encode.
# @return [String] The Punycode payload.
# @raise [PunycodeBigOutput] If the output would exceed ACE_MAX_LENGTH.
# @raise [PunycodeOverflow] If an intermediate value would exceed
#   PUNYCODE_MAXINT.
# @raise [PunycodeBadInput] If an output value is not flagged in
#   PUNYCODE_PRINT_ASCII.
4580:     def self.punycode_encode(unicode)
4581:       input = unicode.unpack("U*")
4582:       output = [0] * (ACE_MAX_LENGTH + 1)
4583:       input_length = input.size
4584:       output_length = [ACE_MAX_LENGTH]
4585: 
4586:       # Initialize the state
4587:       n = PUNYCODE_INITIAL_N
4588:       delta = out = 0
4589:       max_out = output_length[0]
4590:       bias = PUNYCODE_INITIAL_BIAS
4591: 
4592:       # Handle the basic code points:
4593:       input_length.times do |j|
4594:         if punycode_basic?(input[j])
4595:           if max_out - out < 2
4596:             raise PunycodeBigOutput,
4597:               "Output would exceed the space provided."
4598:           end
4599:           output[out] = input[j]
4600:           out += 1
4601:         end
4602:       end
4603: 
4604:       h = b = out
4605: 
4606:       # h is the number of code points that have been handled, b is the
4607:       # number of basic code points, and out is the number of characters
4608:       # that have been output.
4609: 
4610:       if b > 0
4611:         output[out] = PUNYCODE_DELIMITER
4612:         out += 1
4613:       end
4614: 
4615:       # Main encoding loop:
4616: 
4617:       while h < input_length
4618:         # All non-basic code points < n have been
4619:         # handled already.  Find the next larger one:
4620: 
4621:         m = PUNYCODE_MAXINT
4622:         input_length.times do |j|
4623:           m = input[j] if (n...m) === input[j]
4624:         end
4625: 
4626:         # Increase delta enough to advance the decoder's
4627:         # <n,i> state to <m,0>, but guard against overflow:
4628: 
4629:         if m - n > (PUNYCODE_MAXINT - delta) / (h + 1)
4630:           raise PunycodeOverflow, "Input needs wider integers to process."
4631:         end
4632:         delta += (m - n) * (h + 1)
4633:         n = m
4634: 
4635:         input_length.times do |j|
4636:           # Punycode does not need to check whether input[j] is basic:
4637:           if input[j] < n
4638:             delta += 1
4639:             if delta == 0
4640:               raise PunycodeOverflow,
4641:                 "Input needs wider integers to process."
4642:             end
4643:           end
4644: 
4645:           if input[j] == n
4646:             # Represent delta as a generalized variable-length integer:
4647: 
4648:             q = delta; k = PUNYCODE_BASE
4649:             while true
4650:               if out >= max_out
4651:                 raise PunycodeBigOutput,
4652:                   "Output would exceed the space provided."
4653:               end
4654:               t = (
4655:                 if k <= bias
4656:                   PUNYCODE_TMIN
4657:                 elsif k >= bias + PUNYCODE_TMAX
4658:                   PUNYCODE_TMAX
4659:                 else
4660:                   k - bias
4661:                 end
4662:               )
4663:               break if q < t
4664:               output[out] =
4665:                 punycode_encode_digit(t + (q - t) % (PUNYCODE_BASE - t))
4666:               out += 1
4667:               q = (q - t) / (PUNYCODE_BASE - t)
4668:               k += PUNYCODE_BASE
4669:             end
4670: 
4671:             output[out] = punycode_encode_digit(q)
4672:             out += 1
4673:             bias = punycode_adapt(delta, h + 1, h == b)
4674:             delta = 0
4675:             h += 1
4676:           end
4677:         end
4678: 
4679:         delta += 1
4680:         n += 1
4681:       end
4682: 
4683:       output_length[0] = out
4684: 
4685:       outlen = out
4686:       outlen.times do |j|
4687:         c = output[j]
4688:         unless c >= 0 && c <= 127
4689:           raise Exception, "Invalid output char."
4690:         end
4691:         unless PUNYCODE_PRINT_ASCII[c]
4692:           raise PunycodeBadInput, "Input is invalid."
4693:         end
4694:       end
4695: 
4696:       output[0..outlen].map { |x| x.chr }.join("").sub(/\00++\z/, "")
4697:     end
punycode_encode_digit(d) click to toggle source
      # File lib/addressable/idna.rb, line 4832
# Returns the basic codepoint that represents a Punycode digit value.
#
# Values below 26 map to d + 97 ("a" onwards); all other values map to
# d + 22 (so 26 becomes "0").
#
# @param [Integer] d The digit value.
# @return [Integer] The codepoint representing the digit.
def self.punycode_encode_digit(d)
  letter_offset = d < 26 ? 75 : 0
  d + 22 + letter_offset
end
unicode_compose(unpacked) click to toggle source
     # File lib/addressable/idna.rb, line 110
# Composes adjacent codepoint pairs into precomposed codepoints.
#
# Walks the array keeping a current "starter". While the starter's
# combining class is 0, the next codepoint is offered to
# unicode_compose_pair; on success the composite becomes the new starter,
# otherwise the starter is emitted and the next codepoint takes its
# place. Only adjacent pairs are considered.
#
# The starter's combining class is refreshed every time the starter
# changes. Previously the update was assigned to a misspelled variable
# (`startercc`), so the class computed for the first codepoint applied to
# the whole string: a string beginning with a combining mark was never
# composed at all.
#
# @param [Array<Integer>] unpacked The codepoints to compose.
# @return [Array<Integer>] The composed codepoints. An empty input is
#   returned as is.
def self.unicode_compose(unpacked)
  return unpacked if unpacked.empty?

  composed = []
  starter = unpacked[0]
  starter_cc = lookup_unicode_combining_class(starter)

  unpacked[1..-1].each do |ch|
    # Only a starter with combining class 0 may absorb the next codepoint.
    composite = starter_cc == 0 ? unicode_compose_pair(starter, ch) : nil
    if composite
      starter = composite
    else
      composed << starter
      starter = ch
    end
    starter_cc = lookup_unicode_combining_class(starter)
  end

  composed << starter
  composed
end
unicode_compose_pair(ch_one, ch_two) click to toggle source
     # File lib/addressable/idna.rb, line 138
# Composes two codepoints into a single precomposed codepoint, if one
# exists.
#
# Hangul is handled arithmetically:
#   * a leading consonant followed by a vowel yields an LV syllable;
#   * an LV syllable followed by a trailing consonant yields an LVT
#     syllable.
# The trailing consonant must be strictly greater than HANGUL_TBASE:
# HANGUL_TBASE itself is index 0 and does not denote a trailing
# consonant. Accepting it made the pair "compose" to the unchanged LV
# syllable, silently dropping the second codepoint.
#
# Every other pair is looked up in the composition table, keyed by the
# UTF-8 bytes of the two codepoints.
#
# @param [Integer] ch_one The first codepoint.
# @param [Integer] ch_two The second codepoint.
# @return The composed codepoint, or the result of the table lookup
#   (nil in the caller's usage when no composition exists).
def self.unicode_compose_pair(ch_one, ch_two)
  if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT &&
     ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT
    # Hangul L + V
    l_index = ch_one - HANGUL_LBASE
    v_index = ch_two - HANGUL_VBASE
    return HANGUL_SBASE + (l_index * HANGUL_VCOUNT + v_index) * HANGUL_TCOUNT
  elsif ch_one >= HANGUL_SBASE && ch_one < HANGUL_SBASE + HANGUL_SCOUNT &&
        (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 &&
        ch_two > HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT
    # Hangul LV + T
    return ch_one + (ch_two - HANGUL_TBASE)
  end

  # The table lookup expects the UTF-8 bytes of both codepoints.
  utf8_bytes = [ch_one, ch_two].pack("U*").unpack("C*")
  lookup_unicode_composition(utf8_bytes)
end
unicode_decompose(unpacked) click to toggle source
     # File lib/addressable/idna.rb, line 217
# Decomposes an array of codepoints.
#
# Codepoints inside the Hangul syllable range are split with
# unicode_decompose_hangul. Any other codepoint is replaced by the
# recursive decomposition of its compatibility mapping when
# lookup_unicode_compatibility returns one, and kept as is otherwise.
#
# @param [Array<Integer>] unpacked The codepoints to decompose.
# @return [Array<Integer>] A new array with the decomposed codepoints.
def self.unicode_decompose(unpacked)
  result = []
  unpacked.each do |codepoint|
    if codepoint >= HANGUL_SBASE && codepoint < HANGUL_SBASE + HANGUL_SCOUNT
      lead, vowel, trail = unicode_decompose_hangul(codepoint)
      result << lead
      result << vowel if vowel
      result << trail if trail
      next
    end

    mapping = lookup_unicode_compatibility(codepoint)
    if mapping
      result.concat(unicode_decompose(mapping.unpack("U*")))
    else
      result << codepoint
    end
  end
  result
end
unicode_decompose_hangul(codepoint) click to toggle source
     # File lib/addressable/idna.rb, line 238
# Splits a precomposed Hangul syllable into its jamo.
#
# @param [Integer] codepoint The codepoint to split.
# @return [Array] A three-element array [l, v, t]. For a codepoint
#   outside the Hangul syllable range it is [codepoint, nil, nil]. For a
#   syllable without a trailing consonant, t is nil.
def self.unicode_decompose_hangul(codepoint)
  offset = codepoint - HANGUL_SBASE
  return [codepoint, nil, nil] unless offset >= 0 && offset < HANGUL_SCOUNT

  lead = HANGUL_LBASE + offset / HANGUL_NCOUNT
  vowel = HANGUL_VBASE + (offset % HANGUL_NCOUNT) / HANGUL_TCOUNT
  trail_index = offset % HANGUL_TCOUNT
  trail = trail_index == 0 ? nil : HANGUL_TBASE + trail_index
  [lead, vowel, trail]
end
unicode_downcase(input) click to toggle source

Unicode aware downcase method.

This method is part of the private API.

Parameters: input (String) — the input string.

Returns: (String) the downcased result.

     # File lib/addressable/idna.rb, line 103
# Unicode aware downcase method.
#
# Each codepoint of the input is replaced by the result of
# lookup_unicode_lowercase and the codepoints are packed back into a
# string.
#
# @api private
# @param [String] input The input string.
# @return [String] The downcased result.
def self.unicode_downcase(input)
  lowered = input.unpack("U*").map do |codepoint|
    lookup_unicode_lowercase(codepoint)
  end
  lowered.pack("U*")
end
unicode_sort_canonical(unpacked) click to toggle source
     # File lib/addressable/idna.rb, line 193
# Reorders combining marks by combining class.
#
# Works on a copy of the input. Two neighbouring codepoints are swapped
# when both have a non-zero combining class and the first one's class is
# greater than the second's; after a swap the scan steps back one
# position (but never before index 1) so the moved codepoint is compared
# with its new predecessor.
#
# @param [Array<Integer>] unpacked The codepoints to reorder.
# @return [Array<Integer>] A reordered copy of the input.
def self.unicode_sort_canonical(unpacked)
  sorted = unpacked.dup
  count = sorted.length
  return sorted if count < 2

  pos = 1
  while pos < count
    prev_cc = lookup_unicode_combining_class(sorted[pos - 1])
    curr_cc = lookup_unicode_combining_class(sorted[pos])
    if curr_cc != 0 && prev_cc != 0 && prev_cc > curr_cc
      sorted[pos - 1], sorted[pos] = sorted[pos], sorted[pos - 1]
      pos -= 1 if pos > 1
    else
      pos += 1
    end
  end
  sorted
end

Disabled; run with --debug to generate this.

[Validate]

Generated with the Darkfish Rdoc Generator 1.1.6.