Friday, May 27, 2011

DATA Step与PROC SQL的对比

DATA Step与PROC SQL的对比
DATA Step vs. PROC SQL: What’s a neophyte to do? -
这篇文章比较简单,因此只是把一些SAS和SQL相似的功能列举出来,供以后查阅。

1 创建数据集
1.1 从非关系数据库的数据源创建新数据集
数据步data step:
DATA table1;
    INPUT charvar1     $3.
       +1 charvar2     $1.
       +1 numvar1
       +1 numvar2   DATE7.
          ;
DATALINES;
me1 F 35786 10oct50
me3 M 57963 25jun49
fg6 M 25754 17jun47
fg7 F . 17aug53
PROC PRINT DATA=table1;
RUN;

SQL语句:PROC SQL
PROC SQL;
    CREATE TABLE table1
       (  charvar1 CHAR(3)
        , charvar2 CHAR(1)
        , numvar1  NUM
        , numvar2  NUM INFORMAT=DATE7.)
    ;
    INSERT INTO table1
       VALUES('me1','F',35786,'10oct50'd,)
       VALUES('me3','M',57963,'25jun49'd)
       VALUES('fg6','M',25754'17jun47'd)
       VALUES('fg7','F',.,'17aug53'd)
    ;
    SELECT *
        FROM table1;
QUIT;

1.2 从关系数据库的数据源创建新数据集
数据步data step:
LIBNAME olib ORACLE
      SAS/ACCESS-engine-connection-options
   ;
DATA table1;
    SET olib.oracle_table;
    IF var1 = "value1" THEN...
...

SQL语句:PROC SQL
PROC SQL;
    CONNECT TO rdbms 
            AS dbref (dbms-definitions);
    CREATE TABLE table3 AS
    SELECT col1, col2, col3
       FROM CONNECTION TO dbref
         (SELECT col1, col2, col3
             FROM table1, table2
             WHERE table1.co11
                 = table2.col5
             ORDER BY col1
         );
    DISCONNECT FROM dbref;
QUIT;

2 排序数据
SAS过程步排序:
PROC SORT DATA=table1
           OUT=table2
          NODUPLICATES;
    BY var1 var2;

SQL语句:PROC SQL
PROC SQL;
    CREATE TABLE table2 AS
    SELECT DISTINCT *
      FROM table1
     ORDER BY var1 var2;
QUIT;

3 连接数据
3.1 数据集基础连接
SAS数据步:
DATA table3;
    SET table1
        table2;

SQL语句:PROC SQL
PROC SQL;
   CREATE TABLE table3 AS
      SELECT *
         FROM table1
      OUTER UNION CORRESPONDING
      SELECT *
         FROM table2;
QUIT;

3.2 匹配合并
数据步DATA Step:
DATA table3;
    MERGE table1 (IN=l)
          Table2 (IN=r);
    BY keyvar;
    IF l AND r;        

SQL语句:PROC SQL
PROC SQL;
    CREATE TABLE table3 AS
    SELECT *
      FROM      table1 AS l 
     INNER JOIN table2 AS r
        ON l.keyvar=r.keyvar;
QUIT;

4 取子集
4.1 行数据子集
数据步DATA Step:
DATA table2;
    SET table1(WHERE=(var1=value1));

DATA table4;
    SET table3
    IF var1=value1 AND 
       var2 IN (value-list);

DATA table7;
    MERGE table5 table6;
    BY var1;
    IF M0D(var4,3) NE 0 THEN DELETE; 

SQL语句:PROC SQL
PROC SQL;
    CREATE TABLE table3 AS
    SELECT var1, var2, var3, var4
      FROM table1 AS a,
           table2 AS b
     WHERE a.var1=b.var1
       AND a.var1 IN (value-list);
QUIT;

PROC SQL;
   CREATE TABLE table4 AS
   SELECT *
     FROM table3;
   DELETE
     FROM table4
    WHERE var1 IS MISSING;
QUIT;

4.2 列数据子集
数据步DATA Step:
DATA table2;
    SET table1(DROP=var4-var6);

DATA table4;
    MERGE table2 table3;
    BY keyvar;
    KEEP keyvar var1 var2; 

SQL语句:PROC SQL
PROC SQL;
    CREATE TABLE table1 AS
    SELECT var1, var2, var3, var6
      FROM table2;
QUIT;

PROC SQL;
    CREATE TABLE table2(DROP=var4-var6) AS
    SELECT *
      FROM table1;
QUIT;

5 转换数据
5.1 创建新变量
数据步DATA Step:
DATA table2;
    SET table1;
    newvar1 = oldvar2 / oldvar3;
    newvar2 = SUBSTR(charvar5,3,5);

SQL语句:PROC SQL
PROC SQL;
    CREATE TABLE table2 AS
       SELECT var1, var2, 
              oldvar2/oldvar3 AS newvar1
       FROM table1;
QUIT;

5.2 汇总数据
SAS过程步:
PROC SORT DATA=table1;
  BY catvar1;

PROC MEANS DATA=table1;
    BY catvar1;
    VAR var4;
    OUTPUT OUT=table2 
           SUM=totvar4
             N=cntvar4
         NMISS=missvar4    ;

SQL语句:PROC SQL
PROC SQL;
    CREATE TABLE table2 AS
        SELECT catvar1,
               SUM(var4) AS totvar4,
               COUNT(var4) AS cntvar4,
               NMISS(var4) AS missvar4
        FROM table1
        GROUP BY catvar1
        ORDER BY catvar1
    ;
QUIT;

6 数据步与SQL功能对比


7 数据步与SQL优势对比
SQL语句:PROC SQL
PROC SQL包含了很多SAS数据步和过程步的功能
一般实现的语句不会太复杂或太长
对于小的数据表,SQL更有效
SQL是标准化的数据库操作语言,因此基础使用人群更多
并不需要显性地去排序数据集
SQL的join的时候,并不要求变量相同,但长度和类型要求相同
SQL可以直接打印数据集结果,但是可以用noprint禁止打印
可以对数据集用别名

SAS:
用处理更多的数据集
可以同时对文件读取与写入数据
数据处理更灵活
更方便地使用关系数据库的数据
更容易地输入非关系数据库的外部数据

8 技术的选择
熟练或熟悉程度:
准确性:
可维护性:
人力资源的有效性:
处理资源的有效性:
将来的应用:

0 comments:

 
Copyright 2010 NiuNiu's Warehouse. Powered by Blogger
Blogger Templates created by DeluxeTemplates.net | Blogger Styles | Balance Transfer Credit Cards
Wordpress by Wpthemescreator
Blogger Showcase