Friday, May 27, 2011

运用import过程进行SAS数据导入完全实用教程


1 单个规范格式文件导入。
对单个文件进行导入是我们遇到最多的情况,主要有以下几种:

1.1 对指定分隔符(’|’,’ ’,’!’,’ab’等)数据的导入,这里以’!’为例delimiter='!'进行说明:
data _null_;
  file 'c:temppipefile.txt';
  put"X1!X2!X3!X4";
  put "11!22!.! ";
  put "111!.!333!apple";
run;
导入程序:
proc import
  datafile='c:temppipefile.txt'
  out=work.test
  dbms=dlm
  replace;
delimiter='!';
GUESSINGROWS=2000;
DATAROW=2;
  getnames=yes;
run;
注意GUESSINGROWS的值阈为1 到 3276

1.2 对CSV格式的数据进行导入:
data _null_;
  file 'c:tempcsvfile.csv';
  put "Fruit1,Fruit2,Fruit3,Fruit4";
  put "apple,banana,coconut,date";
  put "apricot,berry,crabapple,dewberry";
run;
导入程序:
proc import
  datafile='c:tempcsvfile.csv'
  out=work.fruit
  dbms=csv
  replace;
run;

1.3 对tab分隔数据的导入:
data _null_;
  file 'c:temptabfile.txt';
  put "cereal" "09"x "eggs" "09"x "bacon";
  put "muffin" "09"x "berries" "09"x "toast";
run;
proc import
  datafile='c:temptabfile.txt'
  out=work.breakfast
  dbms=tab
  replace;
  getnames=no;
run;

1.4 对dbf数据库数据进行导入:
proc import datafile="/myfiles/mydata.dbf"
              out=sasuser.mydata
              dbms=dbf
              replace;
run;

1.5对excel数据进行导入:
PROC IMPORT OUT= hospital1
            DATAFILE= " C:My DocumentsExcel FilesHospital1.xls "
            DBMS=EXCEL REPLACE;
     SHEET="Sheet1$";
     GETNAMES=YES;
     MIXED=NO;
     SCANTEXT=YES;
     USEDATE=YES;
     SCANTIME=YES;
RUN;

1.6对access数据进行导入:
PROC IMPORT DBMS=ACCESS TABLE="customers" OUT=sasuser.cust;
     DATABASE="c:democustomers.mdb";
     UID="bob";
     PWD="cat";                                WGDB="c:winntsystem32system.mdb"; 
RUN;
proc print data=sasuser.cust;
run;

1.7 import过程步中,dbms选项汇总:

Identifier
Input Data Source
Extension
Host Availability
ACCESS
Microsoft Access 2000 or 2002 table
.mdb
Microsoft Windows *
ACCESS97
Microsoft Access 97 table
.mdb
Microsoft Windows *
ACCESS2000
Microsoft Access 2000 table
.mdb
Microsoft Windows *
ACCESS2002
Microsoft Access 2002 table
.mdb
Microsoft Windows *
ACCESSCS
Microsoft Access table
.mdb
UNIX
CSV
delimited file (comma-separated values)
.csv
OpenVMS Alpha, UNIX, Microsoft Windows
DBF
dBASE 5.0, IV, III+, and III files
.dbf
UNIX, Microsoft Windows
DLM
delimited file (default delimiter is a blank)
.*
OpenVMS Alpha, UNIX, Microsoft Windows
EXCEL
Excel 2000 or 2002 spreadsheet
.xls
Microsoft Windows *
EXCEL4
Excel 4.0 spreadsheet
.xls
Microsoft Windows
EXCEL5
Excel 5.0 or 7.0 (95) spreadsheet
.xls
Microsoft Windows
EXCEL97
Excel 97 or 7.0 (95) spreadsheet
.xls
Microsoft Windows *
EXCEL2000
Excel 2000 spreadsheet
.xls
Microsoft Windows *
EXCELCS
Excel spreadsheet
.xls
UNIX
JMP
JMP table
.jmp
UNIX, Microsoft Windows
PCFS
Files on PC server
.*
UNIX
TAB
delimited file (tab-delimited values)
.txt
OpenVMS Alpha, UNIX, Microsoft Windows
WK1
Lotus 1-2-3 Release 2 spreadsheet
.wk1
Microsoft Windows
WK3
Lotus 1-2-3 Release 3 spreadsheet
.wk3
Microsoft Windows
WK4
Lotus 1-2-3 Release 4 or 5 spreadsheet
.wk4
Microsoft Windows


2 导入一个文件夹下的所有文件的数据。
2.1下面的代码导入一个文件夹下的所有文件的数据,要使用本代码需注意几点:首先,这个文件夹下的数据文件必须是同一类型分隔的数据,比如例子中都是tab分隔的txt文件,当然也可以对本代码进行改进,例如中间的proc import的dbms改为excel,就可以导入excel文件了。其次,本代码直接将文件名作为SAS数据集的名字,因此文件名必须是英文,且满足SAS命名规则。
%macro directory(dir=);
%let rs=%sysfunc(filename(filref,&dir));
%let did=%sysfunc(dopen(&filref));
%let nobs=%sysfunc(dnum(&did));
%do i=1 %to &nobs.;
    %let name=%qscan(%qsysfunc(dread(&did,&i)),1,.);
    %let ext=%qscan(%qsysfunc(dread(&did,&i)),-1,.);
    proc import out=&name. datafile="&dir.&name..&ext" dbms=tab replace;
        getnames=no;
        datarow=1;
    run;
%end;
%let rc=%sysfunc(dclose(&did));
%mend;
%directory(dir=C:PRIVATE);
如果要将数据集进行汇总到一张表,或者则可以直接将proc import out=&name中的&name改为a&i,然后对所有的a:数据集进行set操作。除此之外,我们还可以对&ext进行设置来达到读取指定文件格式的数据。

2.2 这里运用pipe读取到文件名称,再读取数据。首先建立三个数据集:
data _null_;
  file 'c:junkextfile1.txt';
  put "05JAN2001 6 W12301 1.59 9.54";
  put "12JAN2001 3 P01219 2.99 8.97";
run;
data _null_;
  file 'c:junkextfile2.txt';
  put "02FEB2001 1 P01219 2.99 2.99";
  put "05FEB2001 3 A00901 1.99 5.97";
  put "07FEB2001 2 C21135 3.00 6.00";
run;
data _null_;
  file 'c:junkextfile3.txt';
  put "06MAR2001 4 A00101 3.59 14.36";
  put "12MAR2001 2 P01219 2.99 5.98";
run    
filename blah pipe 'dir C:Junk /b';

data _null_;
  infile blah truncover end=last;
  length fname $20;
  input fname;
  i+1;
  call symput('fname'||trim(left(put(i,8.))),scan(trim(fname),1,'.'));
  call symput('pext'||trim(left(put(i,8.))),trim(fname));
  if last then call symput('total',trim(left(put(i,8.))));
run;
%macro test;
  %do i=1 %to &total;
     proc import datafile="c:Junk&&pext&i"
                 out=work.&&fname&i
                 dbms=dlm replace;
     delimiter=' ';
     getnames=no ;
     run;
     proc print data=work.&&fname&i;;
     title &&fname&i;
     run;
  %end;
%mend;
%test;
这里,如果要导入指定文件类型的数据,例如txt,则只需要将filename blah pipe 'dir C:Junk /b';改为filename blah pipe 'dir C:Junk.*.txt /b';即可。

除了用filename blah pipe 'dir C:Junk.*.txt /b';得到指定类型的文件名,我们还可以%sysexec dir *.xls /b/o:n > flist.txt;来将xls文件输出到指定的文件中,供读取操作用。这个将在下面的内容作介绍。


3 导入excel表中的所有sheet的数据,并将其汇总到一个数据表中。
3.1 Excel表是sas导入导出最多的数据表之一,本例中,我们将导入一个excel中的不同的数据
%let dir=C:ExcelFiles;      
%macro ReadXls (inf);
libname excellib excel "&dir.&inf";
proc sql noprint;
    create table sheetname as
    select tranwrd(memname, "''", "'") as sheetname
    from sashelp.vstabvw
    where libname="EXCELLIB";
    select count(DISTINCT sheetname) into :cnt_sht
    from sheetname;
    select DISTINCT sheetname into :sheet1 - :sheet%left(&cnt_sht)
    from sheetname;
quit;
libname excellib clear;
%do i=1 %to &cnt_sht;
 proc import datafile="&dir.&inf"
       out=sheet&i replace;
       sheet="&&sheet&i";
       getnames=yes;
       mixed=yes;      
 run;
proc append base=master data=sheet&i force;
   run;
%end;
%mend ReadXls;
%ReadXls(all1.xls);
这样,我们可以通过%ReadXls(all2.xls); %ReadXls(all3.xls);等来得到多个excel文件的所有数据集。

3.2 我们可以结合3.1和2.1或2.2的方法来读取多个文件中的多个表。这里再介绍一种新的读取多个文件的方法:
options noxwait;
%macro ReadXls (dir=);
%sysexec cd &dir; %sysexec dir *.xls /b/o:n > flist.txt;

data _indexfile;
    length filen $200;
    infile "&dir./flist.txt";
    input filen $;
run;

proc sql noprint;
    select count(filen) into :cntfile from _indexfile;
    %if &cntfile>=1 %then %do;
    select filen into :filen1-:filen%left(&cntfile)
    from _indexfile;
    %end;
quit;


%do i=1 %to &cntfile;
libname excellib excel "&dir.&&filen&i"; 
proc sql noprint;
    create table sheetname as
    select tranwrd(memname, "''", "'") as sheetname
    from sashelp.vstabvw
    where libname="EXCELLIB";
    select count(DISTINCT sheetname) into :cnt_sht
    from sheetname;
    select DISTINCT sheetname into :sheet1 - :sheet%left(&cnt_sht)
    from sheetname;
quit;
%do j=1 %to &cnt_sht;
     proc import datafile="&dir.&&filen&i"
          out=sheet&j replace;
          sheet="&&sheet&j";
          getnames=yes;
          mixed=yes;
     run;
data sheet&j;
   length _excelfilename $100 _sheetname $32;
   set sheet&j;
   _excelfilename="&&filen&z";
   _sheetname="&&sheet&j";
run;
proc append base=master data=sheet&j force;
run;
%end;
libname excellib clear; 
%end;
%mend ReadXls;
%readxls (dir=C:ExcelFiles);

4 从多个文件夹下读取多个数据。
直接给源代码吧。
%macro etl(ds, ds2,path);
data &ds &ds2;
LENGTH DateTime 8
UserName $ 20
Submit $ 10
SentNumber $ 11
IP $ 15
MessageID $ 15
SendingMode $ 6
Contents $ 160 ;

%let filrf=mydir;
%let rc=%sysfunc(filename(filrf,"&path"));
%let did=%sysfunc(dopen(&filrf));
%let memcount=%sysfunc(dnum(&did));
%do i=1 %to &memcount;
AccountNum+1;
%let counter = AccountNum;
%let username&i=%sysfunc(dread(&did,&i));

%let filref=mydir2;
%let file=%sysfunc(filename(filref,"&path&&username&i"));
%let op=%sysfunc(dopen(&filref));
%let flcount=%sysfunc(dnum(&op));

filename FT77F001 "D:SMSGatewayData2USERS&&username&i*.log";
%do j=1 %to &flcount;
%let trans&j=%sysfunc(dread(&op,&j));
%put '&&username&i = ' &&username&i '&&trans&j= ' &&trans&j '&flcount = ' &flcount '&filref = ' &filref '&filrf = ' &filrf;

infile FT77F001 filename=filename eov=eov end = done length=L DSD;
INPUT DateTime : ANYDTDTM19.
UserName $
Submit $
SentNumber $
IP $
MessageID $
SendingMode $
Contents $;
output;
%end;
%end;
run;
%mend;
%etl(sms2, sms,D:SMSGatewayData2USERS)
这里就不做测试了,原理其实跟3.2差不多,就是将目录也参数化。


5 参考文献:
A Case Study of Importing Multiple Worksheet Files
http://www2.sas.com/proceedings/sugi31/034-31.pdf

Reading multiple files with PROC IMPORT 
http://support.sas.com/kb/24/707.html

How to Invoke PROC IMPORT to read a CSV, TAB or Delimited File  
http://ftp.sas.com/techsup/download/sample/datastep/import.html

Use PROC IMPORT to read a CSV, TAB or delimited file.sas
Reading multiple files with PROC IMPORT.sas
directory_import_dbms.sas
proc_import_dbf.html
http://ftp.sas.com/

0 comments:

 
Copyright 2010 NiuNiu's Warehouse. Powered by Blogger
Blogger Templates created by DeluxeTemplates.net | Blogger Styles | Balance Transfer Credit Cards
Wordpress by Wpthemescreator
Blogger Showcase