Segmenter.cpp 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
/*
 * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
  6. #include <AK/BinarySearch.h>
  7. #include <AK/Utf16View.h>
  8. #include <LibJS/Runtime/Intl/Segmenter.h>
  9. #include <LibUnicode/CharacterTypes.h>
  10. namespace JS::Intl {
  11. // 18 Segmenter Objects, https://tc39.es/ecma402/#segmenter-objects
  12. Segmenter::Segmenter(Object& prototype)
  13. : Object(prototype)
  14. {
  15. }
  16. void Segmenter::set_segmenter_granularity(StringView segmenter_granularity)
  17. {
  18. if (segmenter_granularity == "grapheme"sv)
  19. m_segmenter_granularity = SegmenterGranularity::Grapheme;
  20. else if (segmenter_granularity == "word"sv)
  21. m_segmenter_granularity = SegmenterGranularity::Word;
  22. else if (segmenter_granularity == "sentence"sv)
  23. m_segmenter_granularity = SegmenterGranularity::Sentence;
  24. else
  25. VERIFY_NOT_REACHED();
  26. }
  27. StringView Segmenter::segmenter_granularity_string() const
  28. {
  29. switch (m_segmenter_granularity) {
  30. case SegmenterGranularity::Grapheme:
  31. return "grapheme"sv;
  32. case SegmenterGranularity::Word:
  33. return "word"sv;
  34. case SegmenterGranularity::Sentence:
  35. return "sentence"sv;
  36. default:
  37. VERIFY_NOT_REACHED();
  38. }
  39. }
  40. // 18.8.1 FindBoundary ( segmenter, string, startIndex, direction ), https://tc39.es/ecma402/#sec-findboundary
  41. double find_boundary(Segmenter const& segmenter, Utf16View const& string, double start_index, Direction direction, Optional<Vector<size_t>>& boundaries_cache)
  42. {
  43. // 1. Let locale be segmenter.[[Locale]].
  44. auto const& locale = segmenter.locale();
  45. // 2. Let granularity be segmenter.[[SegmenterGranularity]].
  46. auto granularity = segmenter.segmenter_granularity();
  47. // 3. Let len be the length of string.
  48. auto length = string.length_in_code_units();
  49. // Non-standard, populate boundaries cache
  50. if (!boundaries_cache.has_value()) {
  51. switch (granularity) {
  52. case Segmenter::SegmenterGranularity::Grapheme:
  53. boundaries_cache = Unicode::find_grapheme_segmentation_boundaries(string);
  54. break;
  55. case Segmenter::SegmenterGranularity::Word:
  56. boundaries_cache = Unicode::find_word_segmentation_boundaries(string);
  57. break;
  58. case Segmenter::SegmenterGranularity::Sentence:
  59. boundaries_cache = Unicode::find_sentence_segmentation_boundaries(string);
  60. break;
  61. default:
  62. VERIFY_NOT_REACHED();
  63. }
  64. }
  65. (void)locale; // TODO: Support locale-sensitive boundaries
  66. // 4. If direction is before, then
  67. if (direction == Direction::Before) {
  68. // a. Assert: startIndex ≥ 0.
  69. VERIFY(start_index >= 0);
  70. // b. Assert: startIndex < len.
  71. VERIFY(start_index < length);
  72. // c. Search string for the last segmentation boundary that is preceded by at most startIndex code units from the beginning, using locale locale and text element granularity granularity.
  73. size_t boundary_index;
  74. binary_search(*boundaries_cache, start_index, &boundary_index);
  75. // d. If a boundary is found, return the count of code units in string preceding it.
  76. if (boundary_index < boundaries_cache->size())
  77. return boundaries_cache->at(boundary_index);
  78. // e. Return 0.
  79. return 0;
  80. }
  81. // 5. Assert: direction is after.
  82. VERIFY(direction == Direction::After);
  83. // 6. If len is 0 or startIndex ≥ len, return +∞.
  84. if (length == 0 || start_index >= length)
  85. return INFINITY;
  86. // 7. Search string for the first segmentation boundary that follows the code unit at index startIndex, using locale locale and text element granularity granularity.
  87. size_t boundary_index;
  88. binary_search(*boundaries_cache, start_index, &boundary_index);
  89. ++boundary_index;
  90. // 8. If a boundary is found, return the count of code units in string preceding it.
  91. if (boundary_index < boundaries_cache->size())
  92. return boundaries_cache->at(boundary_index);
  93. // 9. Return len.
  94. return length;
  95. }
  96. }