// Parser.cpp
  1. #include <LibHTML/DOM/Element.h>
  2. #include <LibHTML/DOM/Text.h>
  3. #include <LibHTML/Parser/Parser.h>
  4. #include <ctype.h>
  5. #include <stdio.h>
  6. static Retained<Element> create_element(const String& tag_name)
  7. {
  8. return adopt(*new Element(tag_name));
  9. }
  // Returns true if `ch` may appear inside an attribute name as far as this
  // parser is concerned: an alphanumeric character, '_' or '-'.
  static bool is_valid_in_attribute_name(char ch)
  {
      // The <ctype.h> classifiers take an int that must be representable as
      // unsigned char (or be EOF). Where plain `char` is signed, bytes >= 0x80
      // would arrive as negative values, which is undefined behaviour, so convert
      // before classifying.
      const auto uch = static_cast<unsigned char>(ch);
      return isalnum(uch) != 0 || ch == '_' || ch == '-';
  }
  14. static bool is_self_closing_tag(const String& tag_name)
  15. {
  16. return tag_name == "area"
  17. || tag_name == "base"
  18. || tag_name == "br"
  19. || tag_name == "col"
  20. || tag_name == "embed"
  21. || tag_name == "hr"
  22. || tag_name == "img"
  23. || tag_name == "input"
  24. || tag_name == "link"
  25. || tag_name == "meta"
  26. || tag_name == "param"
  27. || tag_name == "source"
  28. || tag_name == "track"
  29. || tag_name == "wbr";
  30. }
  31. Retained<Document> parse(const String& html)
  32. {
  33. Vector<Retained<ParentNode>> node_stack;
  34. auto doc = adopt(*new Document);
  35. node_stack.append(doc);
  36. enum class State {
  37. Free = 0,
  38. BeforeTagName,
  39. InTagName,
  40. InAttributeList,
  41. InAttributeName,
  42. BeforeAttributeValue,
  43. InAttributeValueNoQuote,
  44. InAttributeValueSingleQuote,
  45. InAttributeValueDoubleQuote,
  46. };
  47. auto state = State::Free;
  48. Vector<char, 256> text_buffer;
  49. Vector<char, 32> tag_name_buffer;
  50. Vector<Attribute> attributes;
  51. Vector<char, 256> attribute_name_buffer;
  52. Vector<char, 256> attribute_value_buffer;
  53. bool is_slash_tag = false;
  54. auto move_to_state = [&](State new_state) {
  55. if (new_state == State::BeforeTagName) {
  56. is_slash_tag = false;
  57. tag_name_buffer.clear();
  58. attributes.clear();
  59. }
  60. if (new_state == State::InAttributeName)
  61. attribute_name_buffer.clear();
  62. if (new_state == State::BeforeAttributeValue)
  63. attribute_value_buffer.clear();
  64. if (state == State::Free && !text_buffer.is_empty()) {
  65. auto text_node = adopt(*new Text(String::copy(text_buffer)));
  66. text_buffer.clear();
  67. node_stack.last()->append_child(text_node);
  68. }
  69. state = new_state;
  70. text_buffer.clear();
  71. };
  72. auto close_tag = [&] {
  73. if (node_stack.size() > 1)
  74. node_stack.take_last();
  75. };
  76. auto open_tag = [&] {
  77. auto new_element = create_element(String::copy(tag_name_buffer));
  78. tag_name_buffer.clear();
  79. new_element->set_attributes(move(attributes));
  80. node_stack.append(new_element);
  81. if (node_stack.size() != 1)
  82. node_stack[node_stack.size() - 2]->append_child(new_element);
  83. if (is_self_closing_tag(new_element->tag_name()))
  84. close_tag();
  85. };
  86. auto commit_tag = [&] {
  87. if (is_slash_tag)
  88. close_tag();
  89. else
  90. open_tag();
  91. };
  92. auto commit_attribute = [&] {
  93. attributes.append({ String::copy(attribute_name_buffer), String::copy(attribute_value_buffer) });
  94. };
  95. for (int i = 0; i < html.length(); ++i) {
  96. char ch = html[i];
  97. switch (state) {
  98. case State::Free:
  99. if (ch == '<') {
  100. is_slash_tag = false;
  101. move_to_state(State::BeforeTagName);
  102. break;
  103. }
  104. text_buffer.append(ch);
  105. break;
  106. case State::BeforeTagName:
  107. if (ch == '/') {
  108. is_slash_tag = true;
  109. break;
  110. }
  111. if (ch == '>') {
  112. move_to_state(State::Free);
  113. break;
  114. }
  115. if (!isalpha(ch))
  116. break;
  117. move_to_state(State::InTagName);
  118. [[fallthrough]];
  119. case State::InTagName:
  120. if (isspace(ch)) {
  121. move_to_state(State::InAttributeList);
  122. break;
  123. }
  124. if (ch == '>') {
  125. commit_tag();
  126. move_to_state(State::Free);
  127. break;
  128. }
  129. tag_name_buffer.append(ch);
  130. break;
  131. case State::InAttributeList:
  132. if (ch == '>') {
  133. commit_tag();
  134. move_to_state(State::Free);
  135. break;
  136. }
  137. if (!isalpha(ch))
  138. break;
  139. move_to_state(State::InAttributeName);
  140. [[fallthrough]];
  141. case State::InAttributeName:
  142. if (is_valid_in_attribute_name(ch)) {
  143. attribute_name_buffer.append(ch);
  144. break;
  145. }
  146. if (isspace(ch)) {
  147. commit_attribute();
  148. break;
  149. }
  150. if (ch == '>') {
  151. commit_tag();
  152. move_to_state(State::Free);
  153. break;
  154. }
  155. if (ch == '=') {
  156. move_to_state(State::BeforeAttributeValue);
  157. break;
  158. }
  159. break;
  160. case State::BeforeAttributeValue:
  161. if (ch == '\'') {
  162. move_to_state(State::InAttributeValueSingleQuote);
  163. break;
  164. }
  165. if (ch == '"') {
  166. move_to_state(State::InAttributeValueDoubleQuote);
  167. break;
  168. }
  169. if (ch == '>') {
  170. commit_tag();
  171. move_to_state(State::Free);
  172. break;
  173. }
  174. if (isspace(ch)) {
  175. commit_attribute();
  176. move_to_state(State::InAttributeList);
  177. break;
  178. }
  179. break;
  180. case State::InAttributeValueSingleQuote:
  181. if (ch == '\'') {
  182. commit_attribute();
  183. move_to_state(State::InAttributeList);
  184. break;
  185. }
  186. attribute_value_buffer.append(ch);
  187. break;
  188. case State::InAttributeValueDoubleQuote:
  189. if (ch == '"') {
  190. commit_attribute();
  191. move_to_state(State::InAttributeList);
  192. break;
  193. }
  194. attribute_value_buffer.append(ch);
  195. break;
  196. case State::InAttributeValueNoQuote:
  197. if (isspace(ch)) {
  198. commit_attribute();
  199. move_to_state(State::InAttributeList);
  200. break;
  201. }
  202. if (ch == '>') {
  203. commit_tag();
  204. move_to_state(State::Free);
  205. break;
  206. }
  207. attribute_value_buffer.append(ch);
  208. break;
  209. default:
  210. fprintf(stderr, "Unhandled state %d\n", (int)state);
  211. ASSERT_NOT_REACHED();
  212. }
  213. }
  214. return doc;
  215. }