Segmenter.cpp 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. /*
  2. * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
  3. * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/Utf16View.h>
  8. #include <LibJS/Runtime/GlobalObject.h>
  9. #include <LibJS/Runtime/Intl/Segmenter.h>
  10. #include <LibUnicode/Segmentation.h>
  11. namespace JS::Intl {
  12. // 18 Segmenter Objects, https://tc39.es/ecma402/#segmenter-objects
  13. Segmenter::Segmenter(Object& prototype)
  14. : Object(ConstructWithPrototypeTag::Tag, prototype)
  15. {
  16. }
  17. void Segmenter::set_segmenter_granularity(StringView segmenter_granularity)
  18. {
  19. if (segmenter_granularity == "grapheme"sv)
  20. m_segmenter_granularity = SegmenterGranularity::Grapheme;
  21. else if (segmenter_granularity == "word"sv)
  22. m_segmenter_granularity = SegmenterGranularity::Word;
  23. else if (segmenter_granularity == "sentence"sv)
  24. m_segmenter_granularity = SegmenterGranularity::Sentence;
  25. else
  26. VERIFY_NOT_REACHED();
  27. }
  28. StringView Segmenter::segmenter_granularity_string() const
  29. {
  30. switch (m_segmenter_granularity) {
  31. case SegmenterGranularity::Grapheme:
  32. return "grapheme"sv;
  33. case SegmenterGranularity::Word:
  34. return "word"sv;
  35. case SegmenterGranularity::Sentence:
  36. return "sentence"sv;
  37. default:
  38. VERIFY_NOT_REACHED();
  39. }
  40. }
  41. // 18.7.1 CreateSegmentDataObject ( segmenter, string, startIndex, endIndex ), https://tc39.es/ecma402/#sec-createsegmentdataobject
  42. ThrowCompletionOr<NonnullGCPtr<Object>> create_segment_data_object(VM& vm, Segmenter const& segmenter, Utf16View const& string, double start_index, double end_index)
  43. {
  44. auto& realm = *vm.current_realm();
  45. // 1. Let len be the length of string.
  46. auto length = string.length_in_code_units();
  47. // 2. Assert: startIndex ≥ 0.
  48. VERIFY(start_index >= 0);
  49. // 3. Assert: endIndex ≤ len.
  50. VERIFY(end_index <= length);
  51. // 4. Assert: startIndex < endIndex.
  52. VERIFY(start_index < end_index);
  53. // 5. Let result be OrdinaryObjectCreate(%Object.prototype%).
  54. auto result = Object::create(realm, realm.intrinsics().object_prototype());
  55. // 6. Let segment be the substring of string from startIndex to endIndex.
  56. auto segment = string.substring_view(start_index, end_index - start_index);
  57. // 7. Perform ! CreateDataPropertyOrThrow(result, "segment", segment).
  58. MUST(result->create_data_property_or_throw(vm.names.segment, PrimitiveString::create(vm, Utf16String::create(segment))));
  59. // 8. Perform ! CreateDataPropertyOrThrow(result, "index", 𝔽(startIndex)).
  60. MUST(result->create_data_property_or_throw(vm.names.index, Value(start_index)));
  61. // 9. Perform ! CreateDataPropertyOrThrow(result, "input", string).
  62. MUST(result->create_data_property_or_throw(vm.names.input, PrimitiveString::create(vm, Utf16String::create(string))));
  63. // 10. Let granularity be segmenter.[[SegmenterGranularity]].
  64. auto granularity = segmenter.segmenter_granularity();
  65. // 11. If granularity is "word", then
  66. if (granularity == Segmenter::SegmenterGranularity::Word) {
  67. // a. Let isWordLike be a Boolean value indicating whether the segment in string is "word-like" according to locale segmenter.[[Locale]].
  68. // TODO
  69. // b. Perform ! CreateDataPropertyOrThrow(result, "isWordLike", isWordLike).
  70. MUST(result->create_data_property_or_throw(vm.names.isWordLike, Value(false)));
  71. }
  72. // 12. Return result.
  73. return result;
  74. }
  75. static Optional<size_t> find_previous_boundary_index(Utf16View const& string, size_t index, Segmenter::SegmenterGranularity granularity)
  76. {
  77. switch (granularity) {
  78. case Segmenter::SegmenterGranularity::Grapheme:
  79. return Unicode::previous_grapheme_segmentation_boundary(string, index);
  80. case Segmenter::SegmenterGranularity::Word:
  81. return Unicode::previous_word_segmentation_boundary(string, index);
  82. case Segmenter::SegmenterGranularity::Sentence:
  83. return Unicode::previous_sentence_segmentation_boundary(string, index);
  84. }
  85. VERIFY_NOT_REACHED();
  86. }
  87. static Optional<size_t> find_next_boundary_index(Utf16View const& string, size_t index, Segmenter::SegmenterGranularity granularity)
  88. {
  89. switch (granularity) {
  90. case Segmenter::SegmenterGranularity::Grapheme:
  91. return Unicode::next_grapheme_segmentation_boundary(string, index);
  92. case Segmenter::SegmenterGranularity::Word:
  93. return Unicode::next_word_segmentation_boundary(string, index);
  94. case Segmenter::SegmenterGranularity::Sentence:
  95. return Unicode::next_sentence_segmentation_boundary(string, index);
  96. }
  97. VERIFY_NOT_REACHED();
  98. }
  99. // 18.8.1 FindBoundary ( segmenter, string, startIndex, direction ), https://tc39.es/ecma402/#sec-findboundary
  100. double find_boundary(Segmenter const& segmenter, Utf16View const& string, double start_index, Direction direction)
  101. {
  102. // 1. Let locale be segmenter.[[Locale]].
  103. // FIXME: Support locale-sensitive boundaries
  104. // 2. Let granularity be segmenter.[[SegmenterGranularity]].
  105. auto granularity = segmenter.segmenter_granularity();
  106. // 3. Let len be the length of string.
  107. auto length = string.length_in_code_units();
  108. // 4. If direction is before, then
  109. if (direction == Direction::Before) {
  110. // a. Assert: startIndex ≥ 0.
  111. VERIFY(start_index >= 0);
  112. // b. Assert: startIndex < len.
  113. VERIFY(start_index < length);
  114. // c. Search string for the last segmentation boundary that is preceded by at most startIndex code units from the beginning, using locale locale and text element granularity granularity.
  115. auto boundary_index = find_previous_boundary_index(string, static_cast<size_t>(start_index) + 1, granularity);
  116. // d. If a boundary is found, return the count of code units in string preceding it.
  117. if (boundary_index.has_value())
  118. return static_cast<double>(*boundary_index);
  119. // e. Return 0.
  120. return 0;
  121. }
  122. // 5. Assert: direction is after.
  123. VERIFY(direction == Direction::After);
  124. // 6. If len is 0 or startIndex ≥ len, return +∞.
  125. if (length == 0 || start_index >= length)
  126. return INFINITY;
  127. // 7. Search string for the first segmentation boundary that follows the code unit at index startIndex, using locale locale and text element granularity granularity.
  128. auto boundary_index = find_next_boundary_index(string, static_cast<size_t>(start_index), granularity);
  129. // 8. If a boundary is found, return the count of code units in string preceding it.
  130. if (boundary_index.has_value())
  131. return static_cast<double>(*boundary_index);
  132. // 9. Return len.
  133. return length;
  134. }
  135. }