OSDN Git Service

1.408.2版リリース準備
[jindolf/JinParser.git] / src / main / java / jp / sourceforge / jindolf / parser / HtmlParser.java
1 /*
2  * XHTML parser
3  *
4  * License : The MIT License
5  * Copyright(c) 2009 olyutorskii
6  */
7
8 package jp.sourceforge.jindolf.parser;
9
10 import java.util.regex.Pattern;
11 import jp.sourceforge.jindolf.corelib.PeriodType;
12 import jp.sourceforge.jindolf.corelib.VillageState;
13
14 /**
15  * 人狼BBS各種XHTML文字列のパースを行いハンドラに通知する。
16  */
17 public class HtmlParser extends AbstractParser{
18
19     private static final String SP = "\u0020";
20
21
22     private BasicHandler basicHandler;
23     private final TalkParser     talkParser     = new TalkParser(this);
24     private final SysEventParser sysEventParser = new SysEventParser(this);
25
26     private final SeqRange rangepool_1 = new SeqRange();
27     private final SeqRange rangepool_2 = new SeqRange();
28
29     /**
30      * コンストラクタ。
31      */
32     public HtmlParser(){
33         super();
34         return;
35     }
36
37     /**
38      * {@link BasicHandler}ハンドラを登録する。
39      * @param basicHandler ハンドラ
40      */
41     public void setBasicHandler(BasicHandler basicHandler){
42         this.basicHandler = basicHandler;
43         return;
44     }
45
46     /**
47      * {@link TalkHandler}ハンドラを登録する。
48      * @param talkHandler ハンドラ
49      */
50     public void setTalkHandler(TalkHandler talkHandler){
51         this.talkParser.setTalkHandler(talkHandler);
52         return;
53     }
54
55     /**
56      * {@link SysEventHandler}ハンドラを登録する。
57      * @param handler ハンドラ
58      */
59     public void setSysEventHandler(SysEventHandler handler){
60         this.sysEventParser.setSysEventHandler(handler);
61         return;
62     }
63
64     private static final Pattern XMLDECL_PATTERN =
65             compile("<\\?xml\u0020");
66     private static final Pattern O_HTML_PATTERN =
67             compile("<html\u0020");
68     private static final Pattern TITLE_PATTERN =
69             compile("<title>([^<]*)</title>");
70     private static final Pattern O_BODY_PATTERN =
71             compile("<body>");
72     private static final Pattern O_DIVMAIN_PATTERN =
73             compile("<div\u0020class=\"main\">");
74
75     /**
76      * XHTML先頭部分のパース。
77      * @throws HtmlParseException パースエラー
78      */
79     private void parseHead() throws HtmlParseException{
80         setContextErrorMessage("lost head part");
81
82         SeqRange titleRange = this.rangepool_1;
83
84         lookingAtAffirm(XMLDECL_PATTERN);
85         shrinkRegion();
86
87         findAffirm(O_HTML_PATTERN);
88         shrinkRegion();
89
90         findAffirm(TITLE_PATTERN);
91         titleRange.setLastMatchedGroupRange(getMatcher(), 1);
92         shrinkRegion();
93
94         this.basicHandler.pageTitle(getContent(), titleRange);
95
96         findAffirm(O_BODY_PATTERN);
97         shrinkRegion();
98
99         findAffirm(O_DIVMAIN_PATTERN);
100         shrinkRegion();
101
102         return;
103     }
104
105     private static final Pattern LOGINFORM_PATTERN =
106             compile(
107                   "("
108                     +"<form"
109                     +SP + "action=\"index\\.rb\""
110                     +SP + "method=\"post\""
111                     +SP + "class=\"login_form\""
112                     +">"
113                 + ")|("
114                     +"<div"
115                     +SP + "class=\"login_form\""
116                     +">"
117                 + ")"
118             );
119     private static final Pattern C_EDIV_PATTERN =
120             compile(
121                   SP_I
122                 + "<a\u0020href=\"[^\"]*\">[^<]*</a>"
123                 + SP_I
124                 + "</div>"
125             );
126     private static final Pattern USERID_PATTERN =
127             compile(
128                   "name=\"user_id\""
129                 + SP
130                 + "value=\"([^\"]*)\""
131             );
132     private static final Pattern C_FORM_PATTERN =
133             compile("</form>");
134
135     /**
136      * ログインフォームのパース。
137      * ログイン名までの認識を確認したのはF国のみ。
138      * @throws HtmlParseException パースエラー
139      */
140     private void parseLoginForm() throws HtmlParseException{
141         setContextErrorMessage("lost login form");
142
143         SeqRange accountRange = this.rangepool_1;
144
145         boolean isLand_E_Form;
146         findAffirm(LOGINFORM_PATTERN);
147         if(isGroupMatched(1)){
148             isLand_E_Form = false;
149         }else{                         // E国ログインフォーム検出
150             isLand_E_Form = true;
151         }
152         shrinkRegion();
153
154         if(isLand_E_Form){
155             lookingAtAffirm(C_EDIV_PATTERN);
156             shrinkRegion();
157             return;
158         }else{
159             findAffirm(USERID_PATTERN);
160             accountRange.setLastMatchedGroupRange(getMatcher(), 1);
161             shrinkRegion();
162
163             if(accountRange.length() > 0){
164                 this.basicHandler
165                     .loginName(getContent(), accountRange);
166             }
167
168             findAffirm(C_FORM_PATTERN);
169             shrinkRegion();
170         }
171
172         return;
173     }
174
175     private static final Pattern VILLAGEINFO_PATTERN =
176             compile(
177                  "([^<]+?)" +SP_I          // 最短一致数量子
178                 +"<strong>"
179                     +"\uff08"
180                     +"([0-9]+)"                       // 月
181                     +"/"
182                     +"([0-9]+)"                       // 日
183                     +SP
184                     +"(?:(?:(午前)|(午後))\u0020)?"  // AMPM
185                     +"([0-9]+)"                       // 時
186                     +"(?:時\u0020|\\:)"
187                     +"([0-9]+)"                       // 分
188                     +"分?\u0020に更新"
189                     +"\uff09"
190                 +"</strong>"
191             );
192
193     /**
194      * 村に関する各種情報をパース。
195      * @throws HtmlParseException パースエラー
196      */
197     private void parseVillageInfo() throws HtmlParseException{
198         setContextErrorMessage("lose village information");
199
200         SeqRange villageRange = this.rangepool_1;
201
202         sweepSpace();
203
204         lookingAtAffirm(VILLAGEINFO_PATTERN);
205         villageRange.setLastMatchedGroupRange(getMatcher(), 1);
206
207         int month  = parseGroupedInt(2);
208         int day    = parseGroupedInt(3);
209         int hour   = parseGroupedInt(6);
210         int minute = parseGroupedInt(7);
211         if(isGroupMatched(5)){  // 午後指定
212             hour = (hour + 12) % 24;
213         }
214         shrinkRegion();
215
216         this.basicHandler.villageName(getContent(), villageRange);
217         this.basicHandler.commitTime(month, day, hour, minute);
218
219         return;
220     }
221
222     private static final Pattern O_PARAG_PATTERN = compile("<p>");
223     private static final Pattern PERIODLINK_PATTERN =
224             compile(
225             "("
226                 + "<span\u0020class=\"time\">"
227             +")|(?:"
228                 + "<a\u0020href=\"([^\"]*)\">"
229             +")|("
230                 + "</p>"
231             +")"
232             );
233     private static final Pattern PERIOD_PATTERN =
234             compile(
235                 "(プロローグ)" +
236             "|"+
237                 "(エピローグ)" +
238             "|"+
239                 "(終了)" +
240             "|"+
241                 "([0-9]+)日目"
242             );
243     private static final Pattern C_SPAN_PATTERN   = compile("</span>");
244     private static final Pattern C_ANCHOR_PATTERN = compile("</a>");
245
246     /**
247      * Period間リンクをパース。
248      * @throws HtmlParseException パースエラー
249      */
250     private void parsePeriodLink() throws HtmlParseException{
251         setContextErrorMessage("lost period link");
252
253         SeqRange anchorRange = this.rangepool_1;
254
255         findAffirm(O_PARAG_PATTERN);
256         shrinkRegion();
257
258         for(;;){
259             Pattern closePattern;
260             anchorRange.setInvalid();
261
262             sweepSpace();
263             lookingAtAffirm(PERIODLINK_PATTERN);
264             if(isGroupMatched(1)){
265                 closePattern = C_SPAN_PATTERN;
266             }else if(isGroupMatched(2)){
267                 closePattern = C_ANCHOR_PATTERN;
268                 anchorRange.setLastMatchedGroupRange(getMatcher(), 2);
269             }else if(isGroupMatched(3)){
270                 shrinkRegion();
271                 break;
272             }else{
273                 assert false;
274                 throw buildParseException();
275             }
276             shrinkRegion();
277
278             int day = -1;
279             PeriodType periodType = null;
280             lookingAtAffirm(PERIOD_PATTERN);
281             if(isGroupMatched(1)){
282                 periodType = PeriodType.PROLOGUE;
283             }else if(isGroupMatched(2)){
284                 periodType = PeriodType.EPILOGUE;
285             }else if(isGroupMatched(3)){
286                 periodType = null;
287             }else if(isGroupMatched(4)){
288                 periodType = PeriodType.PROGRESS;
289                 day = parseGroupedInt(4);
290             }else{
291                 assert false;
292                 throw buildParseException();
293             }
294             shrinkRegion();
295
296             lookingAtAffirm(closePattern);
297             shrinkRegion();
298
299             this.basicHandler.periodLink(getContent(),
300                                          anchorRange,
301                                          periodType, day );
302         }
303
304         return;
305     }
306
307     private static final Pattern O_MESSAGE_PATTERN =
308             compile("<div\u0020class=\"message(?:\u0020ch[0-9]+)?\">");
309     private static final Pattern O_RELOAD_PATTERN =
310             compile("<div\u0020id=\"reload\">");
311     private static final Pattern O_MSGKIND_PATTERN =
312             compile(
313              "(?:"
314                 +"<div\u0020class=\"(?:(announce)|(order)|(extra))\">"
315             +")|(?:"
316                 +"(?:"
317                 +"(?:<a name=\"[^\"]*\">)?"
318                 +SP_I
319                 +"<span\u0020class=\"mes_no\">"
320                     +"([0-9]+)\\."
321                 +"</span>)?"
322                 +SP_I
323                 +"(?:</a>)?"
324                 +SP_I
325                 +"<a\u0020name=\"([^\"]*)\"(?:\u0020class=\"ch_name\")?>"
326             +")"
327             );
328     private static final Pattern C_DIV_PATTERN = compile("</div>");
329
330     /**
331      * 各種メッセージをパース。
332      * @throws HtmlParseException パースエラー
333      */
334     private void parseMessage() throws HtmlParseException{
335         setContextErrorMessage("lost message");
336
337         boolean skipGarbage = true;
338
339         for(;;){
340             sweepSpace();
341
342             boolean matched;
343             if(skipGarbage){
344                 skipGarbage = false;
345                 matched = findProbe(O_MESSAGE_PATTERN); // 最初の1回のみ
346             }else{
347                 matched = lookingAtProbe(O_MESSAGE_PATTERN);
348             }
349             if( ! matched ){
350                 matched = lookingAtProbe(O_RELOAD_PATTERN);
351                 if(matched){
352                     shrinkRegion();
353                     findAffirm(C_DIV_PATTERN);
354                     shrinkRegion();
355                     continue;
356                 }
357                 break;
358             }
359             shrinkRegion();
360
361             dispatchFamily();
362
363             lookingAtAffirm(C_DIV_PATTERN);
364             shrinkRegion();
365         }
366
367         return;
368     }
369
370     /**
371      * イベント種別によって処理を振り分ける。
372      * @throws HtmlParseException パースエラー
373      */
374     private void dispatchFamily() throws HtmlParseException{
375         sweepSpace();
376
377         SeqRange nameRange = this.rangepool_1;
378
379         lookingAtAffirm(O_MSGKIND_PATTERN);
380         if(isGroupMatched(1)){
381             shrinkRegion();
382             this.sysEventParser.parseAnnounce();
383         }else if(isGroupMatched(2)){
384             shrinkRegion();
385             this.sysEventParser.parseOrder();
386         }else if(isGroupMatched(3)){
387             shrinkRegion();
388             this.sysEventParser.parseExtra();
389         }else if(isGroupMatched(5)){
390             nameRange.setLastMatchedGroupRange(getMatcher(), 5);
391             int talkNo = -1;
392             if(isGroupMatched(4)){
393                 talkNo = parseGroupedInt(4);
394             }
395             shrinkRegion();
396             this.talkParser.parseTalk(talkNo, nameRange);
397         }else{
398             assert false;
399             throw buildParseException();
400         }
401
402         return;
403     }
404
405     private static final Pattern O_LISTTABLE_PATTERN =
406             compile("<table\u0020class=\"list\">"
407                    +"(?:"
408                    +  "<tr>"
409                    +    "<th>村名</th>"
410                    +    "<th>Mode</th>"
411                    +    "<th>更新</th>"
412                    +    "<th>状態</th>"
413                    +  "</tr>"
414                    +")?");
415     private static final Pattern ACTIVEVILLAGE =
416             compile(
417              "("
418                 +"</table>"
419             +")|(?:"
420                 +"<tr><td>"
421                 +"<a\u0020href=\"([^\"]*)\">([^<]*)</a>"
422                 +"(?:\u0020|</td><td>(?:通常|[^<]*)</td><td>)"
423                 +"<strong>"
424                     +"(?:\uff08(?:(午前)|(午後))\u0020)?"  // AMPM
425                     +"([0-9]+)"                              // 時
426                     +"(?:時\u0020|\\:)"
427                     +"([0-9]+)"                              // 分
428                     +"(?:\u0020|分\u0020更新\uff09)"
429                 +"</strong>"
430                 +"</td><td>"
431                 +"(?:"
432                     + "(参加者募集中(?:です。)?)"
433                     +"|(開始待ち(?:です。)?)"
434                     +"|(進行中(?:です。)?)"
435                     +"|(勝敗が決定しました。|エピローグ)"
436                     +"|(終了・ログ公開中。)"
437                 +")"
438                 +"</td></tr>"
439             +")"
440             );
441
442     /**
443      * トップページの村一覧表のパース。
444      * @throws HtmlParseException パースエラー
445      */
446     private void parseTopList() throws HtmlParseException{
447         setContextErrorMessage("lost village list");
448
449         SeqRange anchorRange  = this.rangepool_1;
450         SeqRange villageRange = this.rangepool_2;
451
452         if( ! findProbe(O_LISTTABLE_PATTERN) ) return;
453         shrinkRegion();
454         sweepSpace();
455
456         for(;;){
457             lookingAtAffirm(ACTIVEVILLAGE);
458             if(isGroupMatched(1)) break;
459             anchorRange .setLastMatchedGroupRange(getMatcher(), 2);
460             villageRange.setLastMatchedGroupRange(getMatcher(), 3);
461             int hour = parseGroupedInt(6);
462             if(isGroupMatched(5)){
463                 hour = (hour + 12) % 24;
464             }
465             int minute = parseGroupedInt(7);
466
467             VillageState state;
468             if(isGroupMatched(8)){
469                 state = VillageState.PROLOGUE;
470             }else if(isGroupMatched(9)){
471                 state = VillageState.PROLOGUE;
472             }else if(isGroupMatched(10)){
473                 state = VillageState.PROGRESS;
474             }else if(isGroupMatched(11)){
475                 state = VillageState.EPILOGUE;
476             }else if(isGroupMatched(12)){
477                 state = VillageState.GAMEOVER;
478             }else{
479                 assert false;
480                 throw buildParseException();
481             }
482
483             shrinkRegion();
484
485             sweepSpace();
486
487             this.basicHandler.villageRecord(getContent(),
488                                             anchorRange,
489                                             villageRange,
490                                             hour, minute,
491                                             state );
492         }
493
494         return;
495     }
496
497     private static final Pattern O_LISTLOG_PATTERN =
498             compile(
499             "<a\u0020href=\"(index[^\"]*(?:ready_0|000_ready))\">"
500             +"([^<]*)"
501             +"</a><br\u0020/>"
502             );
503
504     /**
505      * 村一覧ページのパース。
506      * @throws HtmlParseException パースエラー
507      */
508     private void parseLogList() throws HtmlParseException{
509         setContextErrorMessage("lost village list");
510
511         SeqRange anchorRange  = this.rangepool_1;
512         SeqRange villageRange = this.rangepool_2;
513
514         boolean is1st = true;
515         for(;;){
516             boolean matched;
517             if(is1st){
518                 matched = findProbe(O_LISTLOG_PATTERN);
519                 is1st = false;
520             }else{
521                 matched = lookingAtProbe(O_LISTLOG_PATTERN);
522             }
523             if( ! matched ) break;
524
525             anchorRange .setLastMatchedGroupRange(getMatcher(), 1);
526             villageRange.setLastMatchedGroupRange(getMatcher(), 2);
527
528             shrinkRegion();
529
530             this.basicHandler.villageRecord(getContent(),
531                                             anchorRange,
532                                             villageRange,
533                                             -1, -1,
534                                             VillageState.GAMEOVER );
535         }
536
537         return;
538     }
539
540     private static final Pattern C_BODY_PATTERN =
541             compile("</body>");
542     private static final Pattern C_HTML_PATTERN =
543             compile(SP_I+ "</html>" +SP_I);
544
545     /**
546      * XHTML末尾のパース。
547      * @throws HtmlParseException パースエラー
548      */
549     private void parseTail() throws HtmlParseException{
550         setContextErrorMessage("lost last part");
551
552         findAffirm(C_BODY_PATTERN);
553         shrinkRegion();
554
555         matchesAffirm(C_HTML_PATTERN);
556         shrinkRegion();
557
558         return;
559     }
560
561     private static final Pattern LISTTITLE_PATTERN =
562             compile("終了した村の記録");
563
564     /**
565      * 人狼BBSのページ種別を自動認識しつつパースする。
566      * @param content パース対象の文字列
567      * @throws HtmlParseException パースエラー
568      */
569     public void parseAutomatic(DecodedContent content)
570             throws HtmlParseException{
571         setContent(content);
572
573         this.basicHandler.startParse(getContent());
574
575         parseHead();
576
577         sweepSpace();
578
579         if(lookingAtProbe(LISTTITLE_PATTERN)){
580             shrinkRegion();
581             this.basicHandler.pageType(PageType.VILLAGELIST_PAGE);
582             parseLogList();
583         }else{
584             parseLoginForm();
585             sweepSpace();
586             if(lookingAtProbe(O_PARAG_PATTERN)){
587                 shrinkRegion();
588                 this.basicHandler.pageType(PageType.TOP_PAGE);
589                 parseTopList();
590             }else{
591                 this.basicHandler.pageType(PageType.PERIOD_PAGE);
592                 parseVillageInfo();
593                 parsePeriodLink();
594                 parseMessage();
595             }
596         }
597
598         parseTail();
599
600         this.basicHandler.endParse();
601
602         reset();
603
604         return;
605     }
606
607 }