FORS8000
Week 10: How do I love SAS? Let me count the
ways -- counts, means, sums, and related information
Save the example dataset species.xls (Excel 2000 format) to a disk or to your hard drive. Create a library named "easy" and import the Excel file into SAS.
/* Make a working copy ("species2") of the imported data in the WORK library,
   then list its contents. */
/* NOTE(review): the instructions mention a library named "easy"; if the Excel
   import wrote the data there, the SET statement would need to read
   easy.species instead -- confirm against how the file was imported. */
data species2;
    set species;
run;

proc print data=species2;
run;
/* Count how many times each species occurs in data set "species2". */
proc freq data=species2;
    /* The TABLES statement names the variable whose values are counted. */
    tables species;
run;
/* Count how many times each species occurs, write the counts to data set
   "spc1" instead of the listing, and print that data set. */
proc freq data=species2 noprint;   /* NOPRINT suppresses the printed output from PROC FREQ */
    tables species / out=spc1;     /* OUT= after the slash names the output data set */
run;

proc print data=spc1;
run;
/* Count the number of state-species combinations. */
proc freq data=species2;
    /* The star (*) between the two variables requests counts for each
       state-by-species combination. */
    tables state*species;
run;
/* Count the number of state-species combinations, write the counts to data
   set "spc1" (replacing any "spc1" written by an earlier step), and print. */
proc freq data=species2 noprint;
    tables state*species / out=spc1;
run;

proc print data=spc1;
run;
/* Count how many times each value of "count" appears in the data set.
   Notice that PROC FREQ treats numeric and character (e.g., state, species)
   values identically. */
proc freq data=species2;
    tables count;
run;
/* Count how many times each value of "count" appears, separately for each
   state. Remember: sort by a variable before running a procedure BY that
   variable. */
proc sort data=species2;
    by state;
run;

proc freq data=species2;
    by state;
    tables count;
run;
/* Count how many times each value of "count" appears, using the Florida (FL)
   observations only. */
proc freq data=species2;
    where state = 'FL';   /* keep only the observations whose state is FL */
    tables count;
run;
/* Calculate the mean, standard deviation, minimum and maximum of the
   variable count. */
proc means data=species2;
    /* The VAR statement names the variable for which the statistics are
       estimated. */
    var count;
run;
/* Calculate the mean, standard deviation, minimum and maximum of the
   variable count, separately for each state. */
proc sort data=species2;
    by state;
run;

proc means data=species2;
    by state;
    var count;
run;
/* Calculate the mean, standard deviation, minimum and maximum of the
   variable count, using only the observations where species is 'dog'. */
proc means data=species2;
    where species = 'dog';   /* restrict the analysis to one species */
    var count;
run;
/* Calculate various statistics of the variable count by state, write them to
   data set "means1", and print. */
proc sort data=species2;
    by state;
run;

proc means data=species2 noprint;   /* NOPRINT suppresses the printed output from PROC MEANS */
    by state;
    var count;
    /* The OUTPUT statement names the output data set with OUT= and lists the
       statistics to save, each as keyword = new-variable-name: the mean,
       standard deviation (stddev), standard error (stderr), maximum (max),
       minimum (min), and sum (sum). */
    output out=means1
           mean=da_mean stddev=da_sd stderr=da_stder
           max=the_max min=the_min sum=the_sum;
run;

proc print data=means1;
run;
/* Calculate the relative abundance of the species in each state and print. */

/* Step 1: total abundance (sum of count) of animals in each state, computed
   with PROC MEANS and written to data set "means2". */
proc sort data=species2;
    by state;
run;

proc means data=species2 noprint;
    by state;
    var count;
    output out=means2 sum=the_sum;
run;

/* Step 2: build "species3" by merging "species2" with "means2" on state, so
   every observation carries its state's total. */
data species3;
    merge species2 means2;
    by state;
run;

/* Step 3: relative abundance (rel_abs) = the count (aka abundance) for each
   species divided by the total abundance (the_sum) of animals in its state.
   The variables _type_ and _freq_ are not needed, so they are dropped. */
data species3;
    set species3;
    rel_abs = count/the_sum;
    drop _type_ _freq_;
run;

proc print data=species3;
run;
/* For grins: order the data by state and, within state, by relative
   abundance, then print. */
proc sort data=species3;
    by state rel_abs;
run;

proc print data=species3;
run;
/* Calculate the mean, standard deviation, and various other statistics of
   the variable count in data set "species2". */
proc univariate data=species2;
    var count;   /* the variable to analyze */
run;
/* Calculate various statistics of the variable count, write them to data set
   "means3", and print. */
proc univariate data=species2 noprint;
    var count;
    /* The OUTPUT statement names the output data set with OUT= and lists the
       statistics to save, each as keyword = new-variable-name: the mean,
       standard deviation (std), standard error (stderr), maximum (max),
       minimum (min), and sum (sum). */
    output out=means3
           mean=da_mean std=da_sd stderr=da_stder
           max=the_max min=the_min sum=the_sum;
run;

proc print data=means3;
run;
/** Notice that "proc univariate" and "proc means" are used to calculate similar statistics. However, each procedure also has certain outputs that are specific to the procedure. One such output is the percentile calculation for a variable. **/
/* Calculate the 10th through 100th percentiles, in steps of 10%, for the
   variable count and write them to data set "pctls". */
proc univariate data=species2 noprint;
    var count;
    /* On the OUTPUT statement: OUT= names the output data set, PCTLPRE= gives
       the prefix for the column headings that hold the percentiles, and
       PCTLPTS= lists the percentile points to compute. */
    output out=pctls pctlpre=P_ pctlpts=10 to 100 by 10;
run;

proc print data=pctls;
run;
/* Plotting the percentile points as a cumulative frequency distribution
   requires a data set with the percentiles and the percentile points in two
   columns. So transpose the output data set "pctls" into "t_pctls"; the
   transposed values are the column the later plotting step calls col1. */
proc transpose data=pctls out=t_pctls;
run;

proc print data=t_pctls;
run;
/* Create data set "plot" with a single column "pctile" holding the values
   10 through 100 in steps of 10 (one observation per value). */
data plot;
    do pctile = 10 to 100 by 10;
        output;
    end;
run;

/* Merge "plot" with the transposed percentile points in "t_pctls" and print.
   There is no BY statement, so observations are paired in order. */
data plot;
    merge plot t_pctls;
run;

proc print data=plot;
run;
/* Draw a cumulative frequency plot: the percentiles ("pctile") against the
   corresponding percentile points ("col1"), using '*' as the plotting
   symbol. */
proc plot data=plot;
    plot pctile*col1 = '*';
run;
quit;