Segmenter.cpp 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. /*
  2. * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/BinarySearch.h>
  7. #include <AK/Utf16View.h>
  8. #include <LibJS/Runtime/GlobalObject.h>
  9. #include <LibJS/Runtime/Intl/Segmenter.h>
  10. #include <LibUnicode/CharacterTypes.h>
  11. namespace JS::Intl {
  12. // 18 Segmenter Objects, https://tc39.es/ecma402/#segmenter-objects
  13. Segmenter::Segmenter(Object& prototype)
  14. : Object(prototype)
  15. {
  16. }
  17. void Segmenter::set_segmenter_granularity(StringView segmenter_granularity)
  18. {
  19. if (segmenter_granularity == "grapheme"sv)
  20. m_segmenter_granularity = SegmenterGranularity::Grapheme;
  21. else if (segmenter_granularity == "word"sv)
  22. m_segmenter_granularity = SegmenterGranularity::Word;
  23. else if (segmenter_granularity == "sentence"sv)
  24. m_segmenter_granularity = SegmenterGranularity::Sentence;
  25. else
  26. VERIFY_NOT_REACHED();
  27. }
  28. StringView Segmenter::segmenter_granularity_string() const
  29. {
  30. switch (m_segmenter_granularity) {
  31. case SegmenterGranularity::Grapheme:
  32. return "grapheme"sv;
  33. case SegmenterGranularity::Word:
  34. return "word"sv;
  35. case SegmenterGranularity::Sentence:
  36. return "sentence"sv;
  37. default:
  38. VERIFY_NOT_REACHED();
  39. }
  40. }
  41. // 18.7.1 CreateSegmentDataObject ( segmenter, string, startIndex, endIndex ), https://tc39.es/ecma402/#sec-createsegmentdataobject
  42. Object* create_segment_data_object(VM& vm, Segmenter const& segmenter, Utf16View const& string, double start_index, double end_index)
  43. {
  44. auto& realm = *vm.current_realm();
  45. auto& global_object = realm.global_object();
  46. // 1. Let len be the length of string.
  47. auto length = string.length_in_code_units();
  48. // 2. Assert: startIndex ≥ 0.
  49. VERIFY(start_index >= 0);
  50. // 3. Assert: endIndex ≤ len.
  51. VERIFY(end_index <= length);
  52. // 4. Assert: startIndex < endIndex.
  53. VERIFY(start_index < end_index);
  54. // 5. Let result be OrdinaryObjectCreate(%Object.prototype%).
  55. auto* result = Object::create(realm, global_object.object_prototype());
  56. // 6. Let segment be the substring of string from startIndex to endIndex.
  57. auto segment = string.substring_view(start_index, end_index - start_index);
  58. // 7. Perform ! CreateDataPropertyOrThrow(result, "segment", segment).
  59. MUST(result->create_data_property_or_throw(vm.names.segment, js_string(vm, segment)));
  60. // 8. Perform ! CreateDataPropertyOrThrow(result, "index", 𝔽(startIndex)).
  61. MUST(result->create_data_property_or_throw(vm.names.index, Value(start_index)));
  62. // 9. Perform ! CreateDataPropertyOrThrow(result, "input", string).
  63. MUST(result->create_data_property_or_throw(vm.names.input, js_string(vm, string)));
  64. // 10. Let granularity be segmenter.[[SegmenterGranularity]].
  65. auto granularity = segmenter.segmenter_granularity();
  66. // 11. If granularity is "word", then
  67. if (granularity == Segmenter::SegmenterGranularity::Word) {
  68. // a. Let isWordLike be a Boolean value indicating whether the segment in string is "word-like" according to locale segmenter.[[Locale]].
  69. // TODO
  70. // b. Perform ! CreateDataPropertyOrThrow(result, "isWordLike", isWordLike).
  71. MUST(result->create_data_property_or_throw(vm.names.isWordLike, Value(false)));
  72. }
  73. // 12. Return result.
  74. return result;
  75. }
  76. // 18.8.1 FindBoundary ( segmenter, string, startIndex, direction ), https://tc39.es/ecma402/#sec-findboundary
  77. double find_boundary(Segmenter const& segmenter, Utf16View const& string, double start_index, Direction direction, Optional<Vector<size_t>>& boundaries_cache)
  78. {
  79. // 1. Let locale be segmenter.[[Locale]].
  80. auto const& locale = segmenter.locale();
  81. // 2. Let granularity be segmenter.[[SegmenterGranularity]].
  82. auto granularity = segmenter.segmenter_granularity();
  83. // 3. Let len be the length of string.
  84. auto length = string.length_in_code_units();
  85. // Non-standard, populate boundaries cache
  86. if (!boundaries_cache.has_value()) {
  87. switch (granularity) {
  88. case Segmenter::SegmenterGranularity::Grapheme:
  89. boundaries_cache = Unicode::find_grapheme_segmentation_boundaries(string);
  90. break;
  91. case Segmenter::SegmenterGranularity::Word:
  92. boundaries_cache = Unicode::find_word_segmentation_boundaries(string);
  93. break;
  94. case Segmenter::SegmenterGranularity::Sentence:
  95. boundaries_cache = Unicode::find_sentence_segmentation_boundaries(string);
  96. break;
  97. default:
  98. VERIFY_NOT_REACHED();
  99. }
  100. }
  101. (void)locale; // TODO: Support locale-sensitive boundaries
  102. // 4. If direction is before, then
  103. if (direction == Direction::Before) {
  104. // a. Assert: startIndex ≥ 0.
  105. VERIFY(start_index >= 0);
  106. // b. Assert: startIndex < len.
  107. VERIFY(start_index < length);
  108. // c. Search string for the last segmentation boundary that is preceded by at most startIndex code units from the beginning, using locale locale and text element granularity granularity.
  109. size_t boundary_index;
  110. binary_search(*boundaries_cache, start_index, &boundary_index);
  111. // d. If a boundary is found, return the count of code units in string preceding it.
  112. if (boundary_index < boundaries_cache->size())
  113. return boundaries_cache->at(boundary_index);
  114. // e. Return 0.
  115. return 0;
  116. }
  117. // 5. Assert: direction is after.
  118. VERIFY(direction == Direction::After);
  119. // 6. If len is 0 or startIndex ≥ len, return +∞.
  120. if (length == 0 || start_index >= length)
  121. return INFINITY;
  122. // 7. Search string for the first segmentation boundary that follows the code unit at index startIndex, using locale locale and text element granularity granularity.
  123. size_t boundary_index;
  124. binary_search(*boundaries_cache, start_index, &boundary_index);
  125. ++boundary_index;
  126. // 8. If a boundary is found, return the count of code units in string preceding it.
  127. if (boundary_index < boundaries_cache->size())
  128. return boundaries_cache->at(boundary_index);
  129. // 9. Return len.
  130. return length;
  131. }
  132. }