lexer.cpp -- CLexerの実装を開始。

author derui <derutakayu@user.sourceforge.jp>

Sun, 7 Jun 2009 01:19:40 +0000 (10:19 +0900)

committer derui <derutakayu@user.sourceforge.jp>

Sun, 7 Jun 2009 01:19:40 +0000 (10:19 +0900)
author derui <derutakayu@user.sourceforge.jp>
Sun, 7 Jun 2009 01:19:40 +0000 (10:19 +0900)
committer derui <derutakayu@user.sourceforge.jp>
Sun, 7 Jun 2009 01:19:40 +0000 (10:19 +0900)
diff --git a/GPATH b/GPATH

index 31aa8fd..e750c10 100644 (file)

Binary files a/GPATH and b/GPATH differ
diff --git a/GRTAGS b/GRTAGS

index 6cdb446..d79173e 100644 (file)

Binary files a/GRTAGS and b/GRTAGS differ
diff --git a/GSYMS b/GSYMS

index 2818b2c..eae3b7f 100644 (file)

Binary files a/GSYMS and b/GSYMS differ
diff --git a/GTAGS b/GTAGS

index a156633..c8aecf5 100644 (file)

Binary files a/GTAGS and b/GTAGS differ
diff --git a/Makefile b/Makefile

index 6566ddb..d9afd1f 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -96,7 +96,7 @@ CC = gcc
  CCDEPMODE = depmode=gcc3
  CFLAGS = -g -O2
  CPP = gcc -E
-CPPFLAGS = 
+CPPFLAGS = -Wall
  CXX = g++
  CXXDEPMODE = depmode=gcc3
  CXXFLAGS = -g -O2
diff --git a/lexer.cpp b/lexer.cpp

index ceb391e..2a76332 100644 (file)
--- a/lexer.cpp
+++ b/lexer.cpp
@@ -12,11 +12,63 @@ smart_ptr<lexer::CLexeme> lexer::CLexer::lex(smart_ptr<utf8::CUTF8InputStream>&
      // 何か一つの非終端記号、終端記号を読みだすたびにそれを返す。
  
      // まずは何はなくとも1文字読みだす。
-    utf8_string::CUTF8Char ch(stream->read());
  
-    if (utf8_string::is_eof(ch))
-    {
-        // eofを示す値を返す。
-        return smart_ptr<lexer::CLexeme>();
+    // EOF以外の値の場合には、通常の字句解析を行っていく。
+    utf8_string::CUTF8String str();
+    bool first = true;
+    while (!stream->isEOF()) {
+
+        // 意味のある文字のみで構成していく。
+
+        utf8_string::CUTF8Char ch(stream->read());
+
+
+        // 最初の一文字で、大体決定されるため、firstをチェックして
+        // いく。
+        if (first)
+        {
+            first = false;
+
+            // ()[]`',.は構文解析の重要な要素となるため、このまま返す。
+            if (ch.toUTF16Code() == '(' || ch.toUTF16Code() == '[')
+            {
+                return makeStartParen();
+            }
+
+            if (ch.toUTF16Code() == ')' || ch.toUTF16Code() == ']')
+            {
+                return makeEndParen();
+            }
+
+            if (ch.toUTF16Code() == '`')
+            {
+                return makeBackQuote();
+            }
+
+            if (ch.toUTF16Code() == '\'')
+            {
+                return makeQuote();
+            }
+
+            if (ch.toUTF16Code() == '.')
+            {
+                return makeDot();
+            }
+
+            if (ch.toUTF16Code() == ',')
+            {
+                return makeComma();
+            }
+        }
+        else
+        {
+            // 読出した文字がデリミタであるかどうか。
+            if (isDelimiter(ch))
+            {
+                // デリミタ文字である場合、このデリミタ文字に来るまでの間に
+                // 保存した文字列から、実際のデータを作成する。
+                
+            }
+        }
      }
  }
diff --git a/lexer.h b/lexer.h

index 4c920f2..bffb28b 100755 (executable)
--- a/lexer.h
+++ b/lexer.h
@@ -8,7 +8,8 @@
  namespace utakata {
  
      namespace lexer {
-    
+
+        class CLexeme;
          class CLexer
          {
              /**
@@ -48,8 +49,8 @@ namespace utakata {
              // つまりは全体をpimplにしておき、データはそれらから取得するようにするということ。
              // それぞれを取得するためのインターフェースはこれから作成される。
          public:
-            ILexeme();
-            virtual ~ILexeme(){}
+            CLexeme();
+            virtual ~CLexeme(){}
  
              // 終端記号、非終端記号のIDを取得する。
              int getID() const;
diff --git a/test/GPATH b/test/GPATH

index 3786bbc..4b8dbcd 100644 (file)

Binary files a/test/GPATH and b/test/GPATH differ
diff --git a/test/GRTAGS b/test/GRTAGS

index 855dc45..e1cb885 100644 (file)

Binary files a/test/GRTAGS and b/test/GRTAGS differ
diff --git a/test/GSYMS b/test/GSYMS

index f2225c6..9870ee1 100644 (file)

Binary files a/test/GSYMS and b/test/GSYMS differ
diff --git a/test/GTAGS b/test/GTAGS

index f3711c1..55bdbc5 100644 (file)

Binary files a/test/GTAGS and b/test/GTAGS differ
diff --git a/test/Makefile b/test/Makefile

index 8619fd1..ca4ee04 100644 (file)
--- a/test/Makefile
+++ b/test/Makefile
@@ -81,7 +81,7 @@ CC = gcc
  CCDEPMODE = depmode=gcc3
  CFLAGS = -g -O2
  CPP = gcc -E
-CPPFLAGS = 
+CPPFLAGS = -Wall
  CXX = g++
  CXXDEPMODE = depmode=gcc3
  CXXFLAGS = -g -O2
diff --git a/test/textarrayformat_test b/test/textarrayformat_test

index cb2b969..5f57e76 100755 (executable)

Binary files a/test/textarrayformat_test and b/test/textarrayformat_test differ
diff --git a/test/utf8_string_test b/test/utf8_string_test

index f8ba269..450afc0 100755 (executable)

Binary files a/test/utf8_string_test and b/test/utf8_string_test differ
diff --git a/test/utf8_string_test.cpp b/test/utf8_string_test.cpp

index c6f0c48..9e12bba 100644 (file)
--- a/test/utf8_string_test.cpp
+++ b/test/utf8_string_test.cpp
@@ -99,6 +99,10 @@ bool utf8_string_util_test(smart_ptr<simpletest::CSimpleTestAsserter> asserter)
      str3.insert(str3.end(), str2.begin(), str2.end());
      asserter->check(str3.toStr(), "あsいうえsssお");
  
+    // 互いに加算できる。
+    utakata::utf8_string::CUTF8String str4 = str + str2;
+    asserter->check(str4.toStr(), "ssおssおあいうえs");
+
      return asserter->isOk();
  }
  
diff --git a/test/utf8_test b/test/utf8_test

index a67c453..aba1943 100755 (executable)

Binary files a/test/utf8_test and b/test/utf8_test differ
diff --git a/test/utf8_test.cpp b/test/utf8_test.cpp

index c8f0247..80c0ea3 100644 (file)
--- a/test/utf8_test.cpp
+++ b/test/utf8_test.cpp
@@ -1,6 +1,8 @@
  #include <iostream>
  #include <sstream>
  #include <string>
+#include <algorithm>
+
  #include <functional>
  
  
@@ -34,6 +36,9 @@ bool utf8_charcheck_test(smart_ptr<simpletest::CSimpleTestAsserter> asserter)
      asserter->check(stream.peek()[0], 'T'); 
      asserter->check(stream.read()[0], 'T');
  
+    // ここの時点ではまだeofではない。
+    asserter->check(stream.isEOF(), false);
+
      // 複数文字の読み出しのチェック
      const std::vector<unsigned char> f = stream.read(2);
      std::string t("", f.size());
@@ -42,6 +47,10 @@ bool utf8_charcheck_test(smart_ptr<simpletest::CSimpleTestAsserter> asserter)
                     identity_local());
      
      asserter->check(t, "F8");
+
+    // この時点でeofがtrueとなる
+    asserter->check(stream.isEOF(), true);
+    asserter->check(stream.read()[0], 0xff);
      
      return asserter->isOk();
  }
diff --git a/utf8.cpp b/utf8.cpp

index caac3eb..6f7c50c 100755 (executable)
--- a/utf8.cpp
+++ b/utf8.cpp
@@ -5,7 +5,6 @@
  #include <sstream>
  #include <assert.h>
  
-
  #include "InputStream.h"
  #include "utf8.h"
  #include "smart_ptr.h"
@@ -13,11 +12,11 @@
  using namespace std;;
  using namespace utakata::utf8;
  
-CUTF8InputStream::CUTF8InputStream() : strm_()
+CUTF8InputStream::CUTF8InputStream() : EOF_(0xff), strm_()
  {
  }
  
-CUTF8InputStream::CUTF8InputStream(const smart_ptr<std::istream>& strm) : strm_(strm)
+CUTF8InputStream::CUTF8InputStream(const smart_ptr<std::istream>& strm) : EOF_(0xff), strm_(strm)
  {
  }
  
@@ -75,7 +74,7 @@ std::vector<unsigned char> CUTF8InputStream::read()
          }
      }
  
-    return std::vector<unsigned char>(0xff);
+    return std::vector<unsigned char>(EOF_);
  }
  
  std::vector<unsigned char> CUTF8InputStream::read(int num)
@@ -83,20 +82,22 @@ std::vector<unsigned char> CUTF8InputStream::read(int num)
      // 指定された文字分だけ読みだしてくる。
      // 途中で終了した場合、その文字の分だけunsigned charが減少すること
      // になっている。
-    std::vector<unsigned char> rtn;
-    for (int i = 0; i < num; ++i)
+    // numが0の場合、必ず空のvectorが返される。
+
+    if (num == 0)
+    {
+        return std::vector<unsigned char>();
+    }
+
+    // eofの場合なら、この時点でeofが返るので、それで問題はない。
+    std::vector<unsigned char> rtn = this->read();
+    for (int i = 1; i < num && !strm_->eof(); ++i)
      {
+        // 個数に到達するか、もしくはeofとなるまでは追加しつづける。
          std::vector<unsigned char> tmp = this->read();
-        if (tmp.size() > 0)
-        {
-            rtn.insert(rtn.end(), tmp.begin(), tmp.end());
-        }
-        else
-        {
-            // サイズより大きくなってしまうような場合には、そのまま抜けることにする。
-            break;
-        }
+        rtn.insert(rtn.end(), tmp.begin(), tmp.end());
      }
+    
      return rtn;
  }
  
@@ -115,6 +116,17 @@ std::vector<unsigned char> CUTF8InputStream::peek()
      return tmp;
  }
  
+// Reports whether the underlying stream has hit end-of-file.
+//
+// The eofbit is queried directly: std::istream::good() is false whenever
+// eofbit is set, so an eof() check nested under good() can never yield
+// true.
+//
+// @return true once eofbit is set on the wrapped stream, false otherwise.
+bool utakata::utf8::CUTF8InputStream::isEOF() const
+{
+    return strm_->eof();
+}
  
  //================================================================================
  
diff --git a/utf8.h b/utf8.h

index d3889c4..00d0acb 100755 (executable)
--- a/utf8.h
+++ b/utf8.h
@@ -34,6 +34,9 @@ namespace utakata {
                 入力ストリームから、UTF-8のデータを指定した文字だけ読みだして
                 返す。
              */
+
+            const unsigned char EOF_;
+            
          public:
              
              // 入力に利用するストリームは最初に渡される。
@@ -49,7 +52,10 @@ namespace utakata {
  
              std::vector<unsigned char> peek();
  
-
+            // ファイルの終端に到達しているかどうかを返す。
+            // trueを返す場合、readの結果は常にEOF文字を返す。
+            bool isEOF() const;
+            
          private:
  
              smart_ptr<std::istream> strm_;
diff --git a/utf8_string.cpp b/utf8_string.cpp

index b46074c..c5dce3f 100644 (file)
--- a/utf8_string.cpp
+++ b/utf8_string.cpp
@@ -1,6 +1,8 @@
  #include <vector>
  #include <string>
  #include <functional>
+#include <algorithm>
+
  #include "utf8.h"
  #include "utf8_string.h"
  
@@ -83,6 +85,14 @@ bool utakata::utf8_string::is_eof(const CUTF8Char& ch)
      return ch.getBytes()[0] == 0xff ? true : false;
  }
  
+CUTF8String utakata::utf8_string::operator+(const CUTF8String& lh, const CUTF8String& rh)
+{
+    // Copy lh, append rh, and return the copy by value (costly: full copy).
+    CUTF8String joined(lh);
+    joined += rh;
+    return joined;
+}
+
  //================================================================================
  
  utakata::utf8_string::CUTF8String::CUTF8String() : chars_()
diff --git a/utf8_string.h b/utf8_string.h

index 5b66851..59451de 100644 (file)
--- a/utf8_string.h
+++ b/utf8_string.h
@@ -125,6 +125,7 @@ namespace utakata {
              // 実体に代入する。代入が行われなかった場合、元のデータ
              // は保存される。
              void assign(const std::vector<unsigned char>& bytes);
+            void assign(const std::vector<CUTF8Char>& chars);
              void assign(const CUTF8String& str);
  
              // iteratorを取得する。
@@ -166,6 +167,8 @@ namespace utakata {
              std::vector<CUTF8Char> chars_;
          };
  
+        CUTF8String operator+(const CUTF8String& lh, const CUTF8String& rh);
+
          // substringの実装を行う。
          // [begin, end)までの文字を文字列として返す。
          // endが渡されないか、0が渡された場合、beginから末尾までが返される。
author	derui <derutakayu@user.sourceforge.jp>
	Sun, 7 Jun 2009 01:19:40 +0000 (10:19 +0900)
committer	derui <derutakayu@user.sourceforge.jp>
	Sun, 7 Jun 2009 01:19:40 +0000 (10:19 +0900)
GPATH		patch \| blob \| history
GRTAGS		patch \| blob \| history
GSYMS		patch \| blob \| history
GTAGS		patch \| blob \| history
Makefile		patch \| blob \| history
lexer.cpp		patch \| blob \| history
lexer.h		patch \| blob \| history
test/GPATH		patch \| blob \| history
test/GRTAGS		patch \| blob \| history
test/GSYMS		patch \| blob \| history
test/GTAGS		patch \| blob \| history
test/Makefile		patch \| blob \| history
test/textarrayformat_test		patch \| blob \| history
test/utf8_string_test		patch \| blob \| history
test/utf8_string_test.cpp		patch \| blob \| history
test/utf8_test		patch \| blob \| history
test/utf8_test.cpp		patch \| blob \| history
utf8.cpp		patch \| blob \| history
utf8.h		patch \| blob \| history
utf8_string.cpp		patch \| blob \| history
utf8_string.h		patch \| blob \| history