4 * License : The MIT License
5 * Copyright(c) 2009 olyutorskii
8 package jp.sourceforge.jindolf.parser;
10 import java.util.regex.Pattern;
11 import jp.sourceforge.jindolf.corelib.PeriodType;
12 import jp.sourceforge.jindolf.corelib.VillageState;
15 * 人狼BBS各種XHTML文字列のパースを行いハンドラに通知する。
17 public class HtmlParser extends AbstractParser{
// Single space character (U+0020), used as a separator when regex fragments are concatenated.
19 private static final String SP = "\u0020";
// Receiver of page-level notifications; assigned through setBasicHandler().
22 private BasicHandler basicHandler;
// Sub-parsers, each constructed with this parser as its argument.
23 private final TalkParser talkParser = new TalkParser(this);
24 private final SysEventParser sysEventParser = new SysEventParser(this);
// Two pre-allocated SeqRange instances that the parse methods borrow as scratch
// ranges (titleRange, anchorRange, villageRange, ...) instead of creating new ones.
26 private final SeqRange rangepool_1 = new SeqRange();
27 private final SeqRange rangepool_2 = new SeqRange();
38 * {@link BasicHandler}ハンドラを登録する。
39 * @param basicHandler ハンドラ
// Registers the BasicHandler by storing it in the basicHandler field.
// No null check is performed here.
41 public void setBasicHandler(BasicHandler basicHandler){
42 this.basicHandler = basicHandler;
47 * {@link TalkHandler}ハンドラを登録する。
48 * @param talkHandler ハンドラ
// Registers the TalkHandler by forwarding it to the internal TalkParser.
50 public void setTalkHandler(TalkHandler talkHandler){
51 this.talkParser.setTalkHandler(talkHandler);
56 * {@link SysEventHandler}ハンドラを登録する。
// Registers the SysEventHandler by forwarding it to the internal SysEventParser.
59 public void setSysEventHandler(SysEventHandler handler){
60 this.sysEventParser.setSysEventHandler(handler);
// Patterns applied by parseHead(), listed in the order that method uses them.
// NOTE(review): compile(...) is not defined in this excerpt; presumably an inherited
// or statically imported wrapper around Pattern.compile - confirm.
// Start of the XML declaration: "<?xml" followed by one space.
64 private static final Pattern XMLDECL_PATTERN =
65 compile("<\\?xml\u0020");
// Opening html tag followed by one space.
66 private static final Pattern O_HTML_PATTERN =
67 compile("<html\u0020");
// Title element; group 1 captures the title text (any characters other than '<').
68 private static final Pattern TITLE_PATTERN =
69 compile("<title>([^<]*)</title>");
// Opening of the body element; the pattern text is not visible in this excerpt.
70 private static final Pattern O_BODY_PATTERN =
// Opening of the div whose class attribute is "main".
72 private static final Pattern O_DIVMAIN_PATTERN =
73 compile("<div\u0020class=\"main\">");
77 * @throws HtmlParseException パースエラー
// Parses the leading part of the page and reports the page title to the BasicHandler.
// NOTE(review): lookingAtAffirm/findAffirm are defined outside this excerpt; presumably
// they match at the current position / search forward and raise HtmlParseException
// on failure - confirm in AbstractParser.
79 private void parseHead() throws HtmlParseException{
// Message used for parse errors raised while this part is being processed.
80 setContextErrorMessage("lost head part");
// Borrow a pooled range to hold the title position.
82 SeqRange titleRange = this.rangepool_1;
// The XML declaration is checked first.
84 lookingAtAffirm(XMLDECL_PATTERN);
// Then the opening html tag is located.
87 findAffirm(O_HTML_PATTERN);
// Locate the title element; group 1 is the title text.
90 findAffirm(TITLE_PATTERN);
91 titleRange.setLastMatchedGroupRange(getMatcher(), 1);
// Notify the handler with the content and the range of the title inside it.
94 this.basicHandler.pageTitle(getContent(), titleRange);
// Advance to the body opening, then to the main div.
96 findAffirm(O_BODY_PATTERN);
99 findAffirm(O_DIVMAIN_PATTERN);
// Patterns used by parseLoginForm().
// Only fragments of LOGINFORM_PATTERN are visible: they require action="index.rb",
// method="post" and class="login_form"; class="login_form" appears a second time,
// presumably in the alternative (land E) form that parseLoginForm() tells apart
// by whether group 1 matched - confirm against the full pattern.
105 private static final Pattern LOGINFORM_PATTERN =
109 +SP + "action=\"index\\.rb\""
110 +SP + "method=\"post\""
111 +SP + "class=\"login_form\""
115 +SP + "class=\"login_form\""
// Pattern checked by parseLoginForm() after the form opener; the visible fragment
// is an anchor element with arbitrary href and text.
119 private static final Pattern C_EDIV_PATTERN =
122 + "<a\u0020href=\"[^\"]*\">[^<]*</a>"
// User id field; group 1 captures the content of the value attribute (may be empty).
126 private static final Pattern USERID_PATTERN =
130 + "value=\"([^\"]*)\""
// Closing part of the login form; the pattern text is not visible in this excerpt.
132 private static final Pattern C_FORM_PATTERN =
137 * ログイン名までの認識を確認したのはF国のみ。
138 * @throws HtmlParseException パースエラー
// Parses the login form and, when an account name is present, reports it to the
// BasicHandler via loginName().
140 private void parseLoginForm() throws HtmlParseException{
// Message used for parse errors raised while this part is being processed.
141 setContextErrorMessage("lost login form");
// Borrow a pooled range to hold the account name position.
143 SeqRange accountRange = this.rangepool_1;
// Set from the form opener: false when group 1 matched, true otherwise.
145 boolean isLand_E_Form;
146 findAffirm(LOGINFORM_PATTERN);
147 if(isGroupMatched(1)){
148 isLand_E_Form = false;
149 }else{ // login form of land E detected
150 isLand_E_Form = true;
// NOTE(review): the lines between are missing here, so it is not visible whether
// this check is conditional on isLand_E_Form - confirm.
155 lookingAtAffirm(C_EDIV_PATTERN);
// Locate the user id field; group 1 is the value attribute content.
159 findAffirm(USERID_PATTERN);
160 accountRange.setLastMatchedGroupRange(getMatcher(), 1);
// An empty value is not reported to the handler.
163 if(accountRange.length() > 0){
165 .loginName(getContent(), accountRange);
// Consume the closing part of the form.
168 findAffirm(C_FORM_PATTERN);
// Village information line consumed by parseVillageInfo(); only fragments are visible.
// parseVillageInfo() reads group 1 as the village name, groups 2/3 as month/day,
// group 5 as the PM marker and groups 6/7 as hour/minute.
175 private static final Pattern VILLAGEINFO_PATTERN =
177 "([^<]+?)" +SP_I // reluctant (shortest-match) quantifier
184 +"(?:(?:(午前)|(午後))\u0020)?" // AM/PM marker (optional)
195 * @throws HtmlParseException パースエラー
// Parses the village information line and reports the village name and the
// commit time (month, day, hour, minute) to the BasicHandler.
197 private void parseVillageInfo() throws HtmlParseException{
// Message used for parse errors raised while this part is being processed.
// Wording aligned with the sibling parsers ("lost ..."); it previously read "lose".
198 setContextErrorMessage("lost village information");
// Borrow a pooled range to hold the village name position.
200 SeqRange villageRange = this.rangepool_1;
// Group 1 of the village information pattern is the village name.
204 lookingAtAffirm(VILLAGEINFO_PATTERN);
205 villageRange.setLastMatchedGroupRange(getMatcher(), 1);
// Groups 2, 3, 6 and 7 carry month, day, hour and minute as decimal text.
207 int month = parseGroupedInt(2);
208 int day = parseGroupedInt(3);
209 int hour = parseGroupedInt(6);
210 int minute = parseGroupedInt(7);
211 if(isGroupMatched(5)){ // PM specified
// Convert a PM hour to 24-hour form; the modulo keeps the result within 0-23.
212 hour = (hour + 12) % 24;
// Notify the handler: first the name range, then the commit time.
216 this.basicHandler.villageName(getContent(), villageRange);
217 this.basicHandler.commitTime(month, day, hour, minute);
// Patterns used by parsePeriodLink().
// Opening paragraph tag.
222 private static final Pattern O_PARAG_PATTERN = compile("<p>");
// One period link entry; only fragments are visible: a span with class "time"
// and an anchor whose href is captured. parsePeriodLink() treats group 1 as the
// span form and group 2 as the anchor form (group 2 is stored as the anchor range).
223 private static final Pattern PERIODLINK_PATTERN =
226 + "<span\u0020class=\"time\">"
228 + "<a\u0020href=\"([^\"]*)\">"
// Period caption; the pattern text is not visible in this excerpt.
233 private static final Pattern PERIOD_PATTERN =
// Closing tags matching the two opener forms above.
243 private static final Pattern C_SPAN_PATTERN = compile("</span>");
244 private static final Pattern C_ANCHOR_PATTERN = compile("</a>");
248 * @throws HtmlParseException パースエラー
// Parses the period links and reports them to the BasicHandler via periodLink().
// NOTE(review): the loop structure and several branches are missing from this
// excerpt; the comments below describe only the visible statements.
250 private void parsePeriodLink() throws HtmlParseException{
// Message used for parse errors raised while this part is being processed.
251 setContextErrorMessage("lost period link");
// Borrow a pooled range to hold the link target position.
253 SeqRange anchorRange = this.rangepool_1;
// Locate the paragraph that introduces the links.
255 findAffirm(O_PARAG_PATTERN);
// Closing tag expected after the caption; chosen from the opener form below.
259 Pattern closePattern;
// Reset the range so an entry without an anchor does not keep a stale value.
260 anchorRange.setInvalid();
263 lookingAtAffirm(PERIODLINK_PATTERN);
// Group 1: span form, closed by a span end tag.
264 if(isGroupMatched(1)){
265 closePattern = C_SPAN_PATTERN;
// Group 2: anchor form, closed by an anchor end tag; group 2 is the href.
266 }else if(isGroupMatched(2)){
267 closePattern = C_ANCHOR_PATTERN;
268 anchorRange.setLastMatchedGroupRange(getMatcher(), 2);
// Group 3: handling is not visible in this excerpt.
269 }else if(isGroupMatched(3)){
// No known alternative matched.
274 throw buildParseException();
// Classify the caption into a period type.
279 PeriodType periodType = null;
280 lookingAtAffirm(PERIOD_PATTERN);
281 if(isGroupMatched(1)){
282 periodType = PeriodType.PROLOGUE;
283 }else if(isGroupMatched(2)){
284 periodType = PeriodType.EPILOGUE;
// Group 3: the assigned period type is not visible in this excerpt.
285 }else if(isGroupMatched(3)){
// Group 4 carries the day number of a progress period.
287 }else if(isGroupMatched(4)){
288 periodType = PeriodType.PROGRESS;
289 day = parseGroupedInt(4);
// No known caption alternative matched.
292 throw buildParseException();
// Consume the closing tag selected above.
296 lookingAtAffirm(closePattern);
// Notify the handler; the remaining arguments are not visible in this excerpt.
299 this.basicHandler.periodLink(getContent(),
// Patterns used by parseMessage() and dispatchFamily().
// Opening of a message div: class "message", optionally followed by " ch" and digits.
307 private static final Pattern O_MESSAGE_PATTERN =
308 compile("<div\u0020class=\"message(?:\u0020ch[0-9]+)?\">");
// Opening of the div with id "reload".
309 private static final Pattern O_RELOAD_PATTERN =
310 compile("<div\u0020id=\"reload\">");
// Message kind selector; only fragments are visible. Groups 1-3 mark the classes
// announce / order / extra. dispatchFamily() reads group 4 as an optional talk
// number and group 5 as the anchor name of a talk.
311 private static final Pattern O_MSGKIND_PATTERN =
314 +"<div\u0020class=\"(?:(announce)|(order)|(extra))\">"
317 +"(?:<a name=\"[^\"]*\">)?"
319 +"<span\u0020class=\"mes_no\">"
325 +"<a\u0020name=\"([^\"]*)\"(?:\u0020class=\"ch_name\")?>"
// Closing div tag.
328 private static final Pattern C_DIV_PATTERN = compile("</div>");
332 * @throws HtmlParseException パースエラー
// Parses the sequence of message blocks.
// NOTE(review): the loop and branch structure are missing from this excerpt;
// the comments below describe only the visible statements.
334 private void parseMessage() throws HtmlParseException{
// Message used for parse errors raised while this part is being processed.
335 setContextErrorMessage("lost message");
// Starts true; the first message opener is searched for with findProbe and later
// ones are probed with lookingAtProbe - presumably this flag selects between the
// two; confirm.
337 boolean skipGarbage = true;
345 matched = findProbe(O_MESSAGE_PATTERN); // first time only
347 matched = lookingAtProbe(O_MESSAGE_PATTERN);
// The reload div is probed as well, and a closing div is then searched for.
350 matched = lookingAtProbe(O_RELOAD_PATTERN);
353 findAffirm(C_DIV_PATTERN);
// A closing div must follow at this point.
363 lookingAtAffirm(C_DIV_PATTERN);
371 * イベント種別によって処理を振り分ける。
372 * @throws HtmlParseException パースエラー
// Dispatches one message to the matching sub-parser according to which group of
// O_MSGKIND_PATTERN matched.
374 private void dispatchFamily() throws HtmlParseException{
// Borrow a pooled range to hold the anchor name of a talk.
377 SeqRange nameRange = this.rangepool_1;
379 lookingAtAffirm(O_MSGKIND_PATTERN);
// Groups 1-3: system events of class announce / order / extra.
380 if(isGroupMatched(1)){
382 this.sysEventParser.parseAnnounce();
383 }else if(isGroupMatched(2)){
385 this.sysEventParser.parseOrder();
386 }else if(isGroupMatched(3)){
388 this.sysEventParser.parseExtra();
// Group 5: a talk; the group holds the anchor name.
389 }else if(isGroupMatched(5)){
390 nameRange.setLastMatchedGroupRange(getMatcher(), 5);
// Group 4 is optional and carries the talk number when present.
392 if(isGroupMatched(4)){
393 talkNo = parseGroupedInt(4);
396 this.talkParser.parseTalk(talkNo, nameRange);
// No known message kind matched.
399 throw buildParseException();
// Patterns used by parseTopList(); only fragments are visible.
// Opening of the table with class "list" (the pattern continues beyond this line).
405 private static final Pattern O_LISTTABLE_PATTERN =
406 compile("<table\u0020class=\"list\">"
// One row of the active village list. parseTopList() reads group 1 as the
// end-of-list marker, group 2 as the link target, group 3 as the village name,
// group 5 as the PM marker, groups 6/7 as hour/minute and groups 8-12 as the
// village state alternatives.
415 private static final Pattern ACTIVEVILLAGE =
421 +"<a\u0020href=\"([^\"]*)\">([^<]*)</a>"
422 +"(?:\u0020|</td><td>(?:通常|[^<]*)</td><td>)"
424 +"(?:\uff08(?:(午前)|(午後))\u0020)?" // AM/PM marker (optional)
428 +"(?:\u0020|分\u0020更新\uff09)"
435 +"|(勝敗が決定しました。|エピローグ)"
444 * @throws HtmlParseException パースエラー
// Parses the village list of the top page and reports each row to the
// BasicHandler via villageRecord().
// NOTE(review): the loop header and some declarations are missing from this excerpt.
446 private void parseTopList() throws HtmlParseException{
// Message used for parse errors raised while this part is being processed.
447 setContextErrorMessage("lost village list");
// Borrow pooled ranges for the link target and the village name.
449 SeqRange anchorRange = this.rangepool_1;
450 SeqRange villageRange = this.rangepool_2;
// Without a list table there is nothing to report.
452 if( ! findProbe(O_LISTTABLE_PATTERN) ) return;
457 lookingAtAffirm(ACTIVEVILLAGE);
// Group 1 marks the end of the list.
458 if(isGroupMatched(1)) break;
459 anchorRange .setLastMatchedGroupRange(getMatcher(), 2);
460 villageRange.setLastMatchedGroupRange(getMatcher(), 3);
// Group 6 is the hour; when the PM marker (group 5) matched it is converted to
// 24-hour form, with the modulo keeping the result within 0-23.
461 int hour = parseGroupedInt(6);
462 if(isGroupMatched(5)){
463 hour = (hour + 12) % 24;
465 int minute = parseGroupedInt(7);
// Map the matched state alternative to a VillageState.
// NOTE(review): groups 8 and 9 both yield PROLOGUE - presumably two captions
// for the same state; confirm against the full ACTIVEVILLAGE pattern.
468 if(isGroupMatched(8)){
469 state = VillageState.PROLOGUE;
470 }else if(isGroupMatched(9)){
471 state = VillageState.PROLOGUE;
472 }else if(isGroupMatched(10)){
473 state = VillageState.PROGRESS;
474 }else if(isGroupMatched(11)){
475 state = VillageState.EPILOGUE;
476 }else if(isGroupMatched(12)){
477 state = VillageState.GAMEOVER;
// No known state alternative matched.
480 throw buildParseException();
// Notify the handler; the remaining arguments are not visible in this excerpt.
487 this.basicHandler.villageRecord(getContent(),
// One entry of the village log list; only a fragment is visible. Group 1 captures
// an href that starts with "index" and ends with "ready_0" or "000_ready";
// parseLogList() additionally reads group 2 as the village name.
497 private static final Pattern O_LISTLOG_PATTERN =
499 "<a\u0020href=\"(index[^\"]*(?:ready_0|000_ready))\">"
506 * @throws HtmlParseException パースエラー
// Parses the list of finished village logs and reports each entry to the
// BasicHandler via villageRecord() with state GAMEOVER.
// NOTE(review): the loop header is missing from this excerpt.
508 private void parseLogList() throws HtmlParseException{
// NOTE(review): same context message as parseTopList(); confirm this is intended.
509 setContextErrorMessage("lost village list");
// Borrow pooled ranges for the link target and the village name.
511 SeqRange anchorRange = this.rangepool_1;
512 SeqRange villageRange = this.rangepool_2;
// The first entry is searched for with findProbe; later entries are probed with
// lookingAtProbe - presumably selected by this flag; confirm.
514 boolean is1st = true;
518 matched = findProbe(O_LISTLOG_PATTERN);
521 matched = lookingAtProbe(O_LISTLOG_PATTERN);
// Stop at the first position where no further entry matches.
523 if( ! matched ) break;
// Group 1 is the link target, group 2 the village name.
525 anchorRange .setLastMatchedGroupRange(getMatcher(), 1);
526 villageRange.setLastMatchedGroupRange(getMatcher(), 2);
// Notify the handler; the intermediate arguments are not visible in this excerpt.
530 this.basicHandler.villageRecord(getContent(),
534 VillageState.GAMEOVER );
// Patterns used by parseTail().
// Closing of the body element; the pattern text is not visible in this excerpt.
540 private static final Pattern C_BODY_PATTERN =
// Closing html tag surrounded by SP_I on both sides.
// NOTE(review): SP_I is defined outside this excerpt; presumably optional
// whitespace - confirm.
542 private static final Pattern C_HTML_PATTERN =
543 compile(SP_I+ "</html>" +SP_I);
547 * @throws HtmlParseException パースエラー
// Parses the trailing part of the page: the body closing, then the html closing.
549 private void parseTail() throws HtmlParseException{
// Message used for parse errors raised while this part is being processed.
550 setContextErrorMessage("lost last part");
// Search forward for the end of the body.
552 findAffirm(C_BODY_PATTERN);
// NOTE(review): matchesAffirm is defined outside this excerpt; presumably it
// requires the whole remaining input to match, so nothing may follow the html
// closing tag - confirm.
555 matchesAffirm(C_HTML_PATTERN);
// Probed by parseAutomatic() ahead of the VILLAGELIST_PAGE notification;
// the pattern text is not visible in this excerpt.
561 private static final Pattern LISTTITLE_PATTERN =
565 * 人狼BBSのページ種別を自動認識しつつパースする。
566 * @param content パース対象の文字列
567 * @throws HtmlParseException パースエラー
// Parses a page while detecting its type, notifying the BasicHandler of the
// start of parsing, the detected page type and the end of parsing.
// NOTE(review): the calls to the individual parse methods and the branch layout
// are missing from this excerpt; the comments below describe only the visible
// statements.
569 public void parseAutomatic(DecodedContent content)
570 throws HtmlParseException{
// Announce the start of parsing with the current content.
573 this.basicHandler.startParse(getContent());
// A list title at the current position is followed by the village list
// page type notification.
579 if(lookingAtProbe(LISTTITLE_PATTERN)){
581 this.basicHandler.pageType(PageType.VILLAGELIST_PAGE);
// Otherwise a paragraph probe is used; either TOP_PAGE or PERIOD_PAGE is reported.
// Which branch each belongs to is not visible here.
586 if(lookingAtProbe(O_PARAG_PATTERN)){
588 this.basicHandler.pageType(PageType.TOP_PAGE);
591 this.basicHandler.pageType(PageType.PERIOD_PAGE);
// Announce the end of parsing.
600 this.basicHandler.endParse();