lexer.cpp -- lexerの実装を開始。

author derui <derutakayu@user.sourceforge.jp>

Sat, 6 Jun 2009 16:31:09 +0000 (01:31 +0900)

committer derui <derutakayu@user.sourceforge.jp>

Sat, 6 Jun 2009 16:31:09 +0000 (01:31 +0900)
author derui <derutakayu@user.sourceforge.jp>
Sat, 6 Jun 2009 16:31:09 +0000 (01:31 +0900)
committer derui <derutakayu@user.sourceforge.jp>
Sat, 6 Jun 2009 16:31:09 +0000 (01:31 +0900)
diff --git a/GPATH b/GPATH

index dcc4a56..31aa8fd 100644 (file)

Binary files a/GPATH and b/GPATH differ
diff --git a/GRTAGS b/GRTAGS

index ccb0cef..6cdb446 100644 (file)

Binary files a/GRTAGS and b/GRTAGS differ
diff --git a/GSYMS b/GSYMS

index 1b37d12..2818b2c 100644 (file)

Binary files a/GSYMS and b/GSYMS differ
diff --git a/GTAGS b/GTAGS

index e29228a..a156633 100644 (file)

Binary files a/GTAGS and b/GTAGS differ
diff --git a/common/GRTAGS b/common/GRTAGS

index 349fa6d..3789319 100644 (file)

Binary files a/common/GRTAGS and b/common/GRTAGS differ
diff --git a/common/GSYMS b/common/GSYMS

index 9e734fe..5501e02 100644 (file)

Binary files a/common/GSYMS and b/common/GSYMS differ
diff --git a/lexer.cpp b/lexer.cpp

new file mode 100644 (file)

index 0000000..ceb391e
--- /dev/null
+++ b/lexer.cpp
@@ -0,0 +1,22 @@
+#include <iostream>
+
+#include "lexer.h"
+
+using namespace utakata;
+
+smart_ptr<lexer::CLexeme> lexer::CLexer::lex(smart_ptr<utf8::CUTF8InputStream>& stream)
+{
+    // Reads one character from `stream` and returns the lexeme recognised from it.
+    // Only end-of-input is handled so far; it yields a default-constructed smart_ptr.
+    utf8_string::CUTF8Char ch(stream->read());
+
+    if (utf8_string::is_eof(ch))
+    {
+        // Value that signals EOF to the caller.
+        return smart_ptr<lexer::CLexeme>();
+    }
+
+    // TODO: tokenise `ch` (not implemented yet). Return explicitly meanwhile:
+    // flowing off the end of a non-void function is undefined behaviour.
+    return smart_ptr<lexer::CLexeme>();
+}
diff --git a/lexer.h b/lexer.h

index 1135618..4c920f2 100755 (executable)
--- a/lexer.h
+++ b/lexer.h
@@ -2,6 +2,8 @@
  #define _LEXER_H_
  
  #include "smart_ptr.h"
+#include "utf8.h"
+#include "utf8_string.h"
  
  namespace utakata {
  
@@ -30,24 +32,38 @@ namespace utakata {
                 渡されたUTF8を解釈するstreamから、データを解釈して、結果を返す。
                 結果は、smart_ptr<CLexeme>で返される。
              */
-            smart_ptr<CLexeme> lex(smart_ptr<CUTF8InputStream>& stream);
+            smart_ptr<CLexeme> lex(smart_ptr<utakata::utf8::CUTF8InputStream>& stream);
  
          private:
          
          };
  
-        class ILexeme
+        class CLexeme
          {
-            // 非終端記号、及び終端記号を表すベースクラス。
+            // Class representing both non-terminal and terminal symbols.
+            // Splitting them into separate classes would have been fine, but
+            // their data are not interchangeable and inheritance would not help,
+            // so for now this class exposes every type that is needed and
+            // callers fetch whichever one they want directly.
+            // In other words, everything lives behind a pimpl and is read from it.
+            // The accessors for each kind of data are still to be written.
          public:
-            ILexeme();
-            virtual ~ILexeme(){}
+            CLexeme();
+            virtual ~CLexeme(){}
  
-            // 終端記号、非終端記号のIDを取得する。
+            // Returns the ID of the terminal or non-terminal symbol.
-            virtual int getID() const = 0;
+            int getID() const;
  
-            // そのものを表す文字列を返す。
-            virtual const CUTF8String getString() const = 0;
+            // Returns the string for lexemes whose data type is string.
+            smart_ptr<utakata::utf8_string::CUTF8String> getString() const;
+
+        private:
+            // Structure bundling every required type. Only one of its
+            // inner members is valid at a time, and some terminal symbols
+            // need none of them.
+            // See the code that interprets each datum for details.
+            struct PImpl;
+            smart_ptr<PImpl> pimpl_;
          };
  
      };
diff --git a/test/utf8_string_test b/test/utf8_string_test

index 6917983..f8ba269 100755 (executable)

Binary files a/test/utf8_string_test and b/test/utf8_string_test differ
diff --git a/utf8.cpp b/utf8.cpp

index 75c36b4..caac3eb 100755 (executable)
--- a/utf8.cpp
+++ b/utf8.cpp
@@ -40,7 +40,7 @@ std::vector<unsigned char> CUTF8InputStream::read()
      // UTF-8の一文字を読みだして返す。
      // UTF-8に該当しない場合、空のvectorを返す。
      if (!strm_->good()) {
-        return std::vector<unsigned char>(0);
+        throw CStreamException("not ready input stream");
      }
  
      // 最初に一文字だけ読みだして、チェックをかける。
@@ -75,7 +75,7 @@ std::vector<unsigned char> CUTF8InputStream::read()
          }
      }
  
-    return std::vector<unsigned char>(0);
+    return std::vector<unsigned char>(1, 0xff); // one 0xff byte = EOF sentinel; (0xff) alone would build 255 zero bytes
  }
  
  std::vector<unsigned char> CUTF8InputStream::read(int num)
diff --git a/utf8.h b/utf8.h

index 01927ec..d3889c4 100755 (executable)
--- a/utf8.h
+++ b/utf8.h
@@ -4,6 +4,8 @@
  #include <iostream>
  #include <string>
  #include <vector>
+#include <exception>
+
  #include "smart_ptr.h"
  #include "InputStream.h"
  
@@ -11,6 +13,21 @@ namespace utakata {
  
      namespace utf8 {
  
+        // Exception thrown when the input stream is not ready for reading.
+        class CStreamException : public std::exception
+        {
+        public:
+            CStreamException(const std::string& str) : str_(str) {}
+            virtual ~CStreamException() throw() {}
+
+            // Must be const to override std::exception::what(); returns the stored message.
+            virtual const char* what() const throw() {
+                return str_.c_str();
+            }
+        private:
+            std::string str_;
+        };
+
          class CUTF8InputStream : public IInputStream
          {
              /**
diff --git a/utf8_string.cpp b/utf8_string.cpp

index 83d35ae..b46074c 100644 (file)
--- a/utf8_string.cpp
+++ b/utf8_string.cpp
@@ -67,6 +67,22 @@ std::string utakata::utf8_string::CUTF8Char::toStr() const
      return tmp;
  }
  
+bool utakata::utf8_string::is_ascii_char(const CUTF8Char& ch)
+{
+    // A character counts as ASCII when its code lies in 0x00..0x7f and it is
+    // encoded as exactly one byte.
+    if (ch.toUTF16Code() < 0 || ch.toUTF16Code() >= 0x80)
+    {
+        return false;
+    }
+    return ch.getBytes().size() == 1;
+}
+
+bool utakata::utf8_string::is_eof(const CUTF8Char& ch)
+{
+    return ch.getBytes().size() > 0 && ch.getBytes()[0] == 0xff; // leading 0xff marks EOF; an empty character is never indexed
+}
+
  //================================================================================
  
  utakata::utf8_string::CUTF8String::CUTF8String() : chars_()
diff --git a/utf8_string.h b/utf8_string.h

index 63d51ef..5b66851 100644 (file)
--- a/utf8_string.h
+++ b/utf8_string.h
@@ -92,6 +92,11 @@ namespace utakata {
          // 渡されたCUTF8Charがasciiコードの範囲内に収まっているかどうかを返す。
          bool is_ascii_char(const CUTF8Char& ch);
  
+
+        // In UTF-8 the leading byte can never be 0xff, so a character whose
+        // first byte is 0xff is treated as the end-of-input marker.
+        bool is_eof(const CUTF8Char& ch);
+
          //================================================================================
  
          class CUTF8String
author	derui <derutakayu@user.sourceforge.jp>
	Sat, 6 Jun 2009 16:31:09 +0000 (01:31 +0900)
committer	derui <derutakayu@user.sourceforge.jp>
	Sat, 6 Jun 2009 16:31:09 +0000 (01:31 +0900)
GPATH		patch \| blob \| history
GRTAGS		patch \| blob \| history
GSYMS		patch \| blob \| history
GTAGS		patch \| blob \| history
common/GRTAGS		patch \| blob \| history
common/GSYMS		patch \| blob \| history
lexer.cpp	[new file with mode: 0644]	patch \| blob
lexer.h		patch \| blob \| history
test/utf8_string_test		patch \| blob \| history
utf8.cpp		patch \| blob \| history
utf8.h		patch \| blob \| history
utf8_string.cpp		patch \| blob \| history
utf8_string.h		patch \| blob \| history