/*
	Description: Creates the estimation sample used to generate all figures and tables
	
	Author: Brendan Price
	Prepared for public disclosure: 4/28/14
	
	NB: Adapted from the file "nber-ces-builder-v3.do" in our own file system.
*/

* Interpret the rest of this file under Stata 12.1 rules
version 12.1
* Do not pause output with -more- prompts
set more off
* Start from an empty session
clear all
* NOTE(review): -macro drop- takes names of global macros, so this line targets a global called "all";
* the form that clears every global macro is -macro drop _all- -- confirm which one was intended
macro drop all
* Cancel any dataset still held by an earlier -preserve-; -capture- suppresses the error when there is none
capture restore, not

*** Program to deflate all variables ***
capture program drop deflate_variables
program define deflate_variables
	/*
		Converts the nominal series currently in memory to real terms.

		Reads:   year, vship, matcost, invest, energy, pay, cap, prode, prodw, prodh,
		         and the price indices piship, pimat, piinv, pien.
		Creates: real_vship, real_matcost, real_invest, real_energy (own price index),
		         real_pay, real_prodpay (PCE).
		Renames: prode/prodw/prodh -> prodemp/prodpay/prodhrs, and every nominal series
		         (including cap) gets a "nom_" prefix.
		Drops:   the four price indices (they are recomputed later) and pce.
		Aborts if any year in memory has no match in the PCE file.
	*/

	* Shipments, materials, investment, and energy are each divided by their own price index
	local series    "vship matcost invest energy"
	local deflators "piship pimat piinv pien"
	forvalues i = 1/4 {
		local s : word `i' of `series'
		local d : word `i' of `deflators'
		gen real_`s' = `s'/`d'
	}
	drop `deflators'

	* Clearer names for the production-worker variables
	rename (prode prodw prodh) (prodemp prodpay prodhrs)

	* Payroll series are deflated with the PCE (to 2007 dollars); PCE years absent from memory are discarded
	merge m:1 year using "../../dta/pce/pce.dta", assert(2 3) keep(3) nogenerate
	foreach s in pay prodpay {
		gen real_`s' = `s'/pce
	}
	drop pce

	* Keep the nominal series, flagged by a "nom_" prefix
	foreach s in vship matcost invest energy pay prodpay cap {
		rename `s' nom_`s'
	}
end

*** Program to drop industries that are partly/wholly reclassified as non-manufacturing following the NAICS conversion ***
capture program drop drop_sic_inds
program define drop_sic_inds
	* Removes, in a single pass, every observation whose sic is one of the six codes
	* (2411, 2711, 2721, 2731, 2741, 2771) excluded from the sample; expects a numeric variable sic
	drop if inlist(sic, 2411, 2711, 2721, 2731, 2741, 2771)
end

******* START OF MAIN CODE BODY *******

/* Verify that the PCE is normalized to unity in 2007 */

use "../../dta/pce/pce.dta", clear

* An -assert ... if- over zero observations passes vacuously, so first require that a 2007 row exists
count if year == 2007
assert r(N) > 0

* The 2007 value itself must be exactly one
assert pce == 1 if year == 2007

/* Convert 1997-2009 data from NAICS to SIC codes using David Dorn's crosswalk */

* Pull the NAICS-format NBER-CES file and keep only the series used downstream
local naics_keep "naics year vship matcost invest energy cap emp pay prode prodh prodw piship pimat piinv pien"
use "../../dta/nber-ces/raw/naics5809.dta", clear
keep `naics_keep'

* Restrict to 1997-2009; -fillin- would add rows if any industry-year were absent, so none may be added
keep if inrange(year, 1997, 2009)
fillin naics year
assert !_fillin
drop _fillin

* Rebase each price index so that it equals one in 2007 rather than 1997
foreach pidx in piship pimat piinv pien {
	assert !missing(`pidx') & `pidx' > 0
	* Spread each industry's 2007 index value across all of its years, then divide through
	bysort naics: egen base2007 = total(`pidx' * (year == 2007))
	replace `pidx' = `pidx'/base2007
	drop base2007
}

* Every series must be populated and strictly positive
local must_be_positive "vship matcost invest energy cap emp pay prode prodh prodw piship pimat piinv pien"
foreach series of local must_be_positive {
	assert !missing(`series') & `series' > 0
}

* Build real shipments, inputs, and pay
deflate_variables

* Attach every 1987 SIC code (and its weight) that each NAICS industry maps into
joinby naics using "../../xwalks/naics97_sic87.dta", unmatched(both) _merge(naics_merge)

* No NAICS industry in the data may lack a crosswalk entry; crosswalk-only rows are discarded
assert inlist(naics_merge, 2, 3)
drop if naics_merge != 3
drop naics_merge modified

* Within each NAICS industry-year the weights must add up to one, up to floating point error
bysort naics year: egen weight_sum = total(weight)
assert abs(weight_sum - 1) < .001
drop naics weight_sum

* Reweight in preparation for collapsing to the SIC87 level.
* Each variable that gets reweighted leaves behind a local flag reweighted_<name>, which the check below consumes.
foreach v of varlist emp prodemp prodhrs nom_pay nom_prodpay nom_vship nom_matcost nom_invest nom_energy nom_cap real_pay real_prodpay real_vship real_matcost real_invest real_energy {
	replace `v' = `v' * weight
	local reweighted_`v' "yes"
}
drop weight

* Verify that all quantitative variables were properly reweighted
quietly describe, varlist

* Copy the list by macro substitution rather than with "=": "=" evaluates a string expression,
* which can truncate a long list and would silently shorten this check
local full_varlist "`r(varlist)'"

foreach v of local full_varlist {
	* These variables are exceptions: all others should have been reweighted
	if "`v'" != "sic" & "`v'" != "year" {
		di "`v': `reweighted_`v''"
		assert "`reweighted_`v''" == "yes"
		* Clear the local flag (-macro drop name- would look for a global macro instead)
		local reweighted_`v'
	}
}

* Keep only the pieces that land in manufacturing SIC codes (2000-3999)
keep if inrange(sic, 2000, 3999)

* Add the weighted pieces back up to one row per SIC87 industry and year
local sic_measures "emp prodemp prodhrs nom_pay nom_prodpay nom_vship nom_matcost nom_invest nom_energy nom_cap real_pay real_prodpay real_vship real_matcost real_invest real_energy"
collapse (sum) `sic_measures', by(year sic)

* Drop the greeting card industry (which is not handled consistently pre/post NAICS)
drop if sic == 2770

* The result must be a balanced industry-by-year panel: -fillin- may not add any row
quietly fillin sic year
assert !_fillin
drop _fillin

* Park the NAICS-era data in a temporary file and move on to the native SIC data
tempfile n1
save "`n1'.dta", replace

/*
	Boat building (SIC 3732, NAICS 336612).
	The SIC series is incomplete and also covers "boat repairing", whereas the NAICS industry is boat building only.
	The NAICS series is therefore used for every year, standing in for the SIC observations during the SIC years.
*/

use "../../dta/nber-ces/raw/naics5809.dta", clear
keep naics year vship matcost invest energy cap emp pay prode prodh prodw piship pimat piinv pien
keep if naics == 336612

* Relabel the industry with its SIC code so that it can be appended to the SIC-format data
rename naics sic
replace sic = 3732

* Marker used after appending to tell these rows apart from the native SIC rows
gen from_naics = 1

tempfile boats
save "`boats'.dta", replace

/* Working with native SIC data: for years 1963-1996. */

* Pull the SIC-format NBER-CES file and keep only the series used downstream
use "../../dta/nber-ces/raw/sic5809.dta", clear
keep sic year vship matcost invest energy cap emp pay prode prodh prodw piship pimat piinv pien

* Remove the six industries that are partly/wholly reclassified as non-manufacturing under NAICS
drop_sic_inds

* Asbestos (3292) is missing after 1991: fold its quantities into nonmetallic minerals n.e.c. (3299).
* The price indices are not part of this loop, so the pooled industry keeps 3299's indices.
foreach series in vship matcost invest energy cap emp pay prode prodh prodw {
	bysort year: egen pooled = total(`series' * inlist(sic, 3292, 3299))
	replace `series' = pooled if sic == 3299
	drop pooled
}
drop if sic == 3292

* Swap in the NAICS-based boat building series (SIC 3732, NAICS 336612) for the native SIC rows
append using "`boats'.dta"
drop if sic == 3732 & missing(from_naics)
drop from_naics

* Rebase each price index so that it equals one in 2007 rather than 1997
foreach pidx in piship pimat piinv pien {
	assert !missing(`pidx') & `pidx' > 0
	* Spread each industry's 2007 index value across all of its years, then divide through
	bysort sic: egen base2007 = total(`pidx' * (year == 2007))
	replace `pidx' = `pidx'/base2007
	drop base2007
}

* Only 1963-1996 come from the SIC-format file
keep if inrange(year, 1963, 1996)

* The result must be a balanced industry-by-year panel: -fillin- may not add any row
quietly fillin sic year
assert !_fillin
drop _fillin

* Every series must be populated and strictly positive
foreach series in vship matcost invest energy cap emp pay prode prodh prodw piship pimat piinv pien {
	assert !missing(`series') & `series' > 0
}

* Build real shipments, inputs, and pay
deflate_variables

/*
	Stack the SIC-year data on top of the NAICS-year data,
	then save a cleaned dataset at the 4-digit (sic87dd) level.
*/

* Add the rows for the NAICS years
append using "`n1'.dta"

* No "auxiliary industry" codes (20001, 30001, etc.) may be present
assert sic <= 9999

* Translate SIC87 codes into sic87dd codes; every code on either side must find a match
rename sic sic87
merge m:1 sic87 using "../../xwalks/sic87_sic87dd.dta", assert(3) nogenerate
drop sic87

* Once the fishing industry is removed, only manufacturing codes may remain
drop if sic87dd == 0900
assert inrange(sic87dd, 2000, 3999)

* Sum up to one row per sic87dd industry and year
local core_measures "emp prodemp prodhrs nom_pay nom_prodpay nom_vship nom_matcost nom_invest nom_energy nom_cap real_pay real_prodpay real_vship real_matcost real_invest real_energy"
collapse (sum) `core_measures', by(year sic87dd)

* The panel must be balanced: -fillin- may not add any row
quietly fillin sic87dd year
assert !_fillin
drop _fillin

* Every series must be populated and strictly positive
foreach series of local core_measures {
	assert !missing(`series') & `series' > 0
}

tempfile core
save "`core'.dta", replace

/* Prepare data on computer investment in 1977, 1982, and 1987 */

* Total investment by industry in the three benchmark years, one column per year
use "../../dta/nber-ces/raw/sic5809.dta", clear
keep if inlist(year, 1977, 1982, 1987)
keep year sic invest
assert !missing(invest) & invest > 0
reshape wide invest, i(sic) j(year)

* Remove the six industries that are partly/wholly reclassified as non-manufacturing under NAICS.
* (Skipping this step would, in some cases - such as sic87dd == 2731 - keep these industries in the
* computer investment totals while they are excluded from the overall investment totals.)
drop_sic_inds

* Go from 1987 SIC codes to 1972 SIC codes; every SIC87 code in the data must appear in the crosswalk
rename sic sic87
joinby sic87 using "../../xwalks/sic72_sic87.dta", unmatched(both) _merge(check)
assert inlist(check, 2, 3)
drop if check != 3
drop check

* Within each 1987 SIC, the shares allocated to 1972 codes must add up to one
bysort sic87: egen share_sum = total(sh8772)
assert abs(share_sum - 1) < .0001
drop share_sum

* Allocate investment across 1972 codes and add it up by 1972 code
foreach total of varlist invest1977 invest1982 invest1987 {
	replace `total' = `total' * sh8772
}
collapse (sum) invest1977 invest1982 invest1987, by(sic72)

* Bring in the Berman, Bound, and Griliches computer investment ratios (rows found only there are discarded)
rename sic72 sic4
merge 1:1 sic4 using "../../dta/bbg/ci778287.dta", assert(2 3) keep(3) nogenerate

* Give the ratios four-digit year suffixes
foreach yy in 77 82 87 {
	rename ci`yy' ci19`yy'
}

* Nominal computer investment = ratio x total investment
foreach y in 1977 1982 1987 {
	gen nom_compinvest`y' = ci`y' * invest`y'
}

keep sic4 nom_compinvest1977 nom_compinvest1982 nom_compinvest1987

* Return from 1972 SIC codes to 1987 SIC codes; every 1972 code in the data must appear in the crosswalk
rename sic4 sic72
joinby sic72 using "../../xwalks/sic72_sic87.dta", unmatched(both) _merge(check)
assert inlist(check, 2, 3)
drop if check != 3
drop check

* Within each 1972 SIC, the shares allocated to 1987 codes must add up to one
bysort sic72: egen share_sum = total(sh7287)
assert abs(share_sum - 1) < .0001
drop share_sum

* Allocate computer investment across 1987 codes and add it up by 1987 code
foreach ci of varlist nom_compinvest1977 nom_compinvest1982 nom_compinvest1987 {
	replace `ci' = `ci' * sh7287
}
collapse (sum) nom_compinvest1977 nom_compinvest1982 nom_compinvest1987, by(sic87)

* Translate into sic87dd codes (crosswalk rows with no data are discarded)
merge 1:1 sic87 using  "../../xwalks/sic87_sic87dd.dta", assert(2 3) keep(3) nogenerate
drop sic87

* Once the fishing industry is removed, only manufacturing codes may remain
drop if sic87dd == 0900
assert inrange(sic87dd, 2000, 3999)
collapse (sum) nom_compinvest*, by(sic87dd)

* Deflate with the PCE: go long by year to merge the index, then return to one row per industry
reshape long nom_compinvest, i(sic87dd) j(year)
merge m:1 year using "../../dta/pce/pce.dta", assert(2 3) keep(3) nogenerate

gen real_compinvest = nom_compinvest/pce
keep sic87dd year nom_compinvest real_compinvest
reshape wide nom_compinvest real_compinvest, i(sic87dd) j(year)

tempfile comp_investments_778287
save "`comp_investments_778287'.dta", replace

/* Prepare the 1992 computer investment data */

* Total 1992 investment from the NBER-CES, with SIC 2067 folded into SIC 2064
use "../../dta/nber-ces/raw/sic5809.dta", clear
keep if year == 1992
replace sic = 2064 if sic == 2067
collapse (sum) invest, by(sic)
tempfile investments
save "`investments'.dta", replace

* Computer and total investments from the 1992 Census of Manufactures pdf files;
* the two sources must cover exactly the same set of SIC codes
use "../../dta/cm/cm1992.dta", clear
merge 1:1 sic using "`investments'.dta", assert(3) nogenerate
keep sic invest nom_invest_cm1992 nom_compinvest_cm1992 min_imputation max_imputation comments

* Resolve the text codes in the computer investment field:
*   "Z" (meaning "<.5") becomes zero;
*   "D"/"S" (missing/suppressed) become the midpoint of the imputation bounds when both bounds exist,
*   and are left blank otherwise
replace nom_compinvest_cm1992 = "0" if nom_compinvest_cm1992 == "Z"
replace nom_compinvest_cm1992 = string((min_imputation + max_imputation)/2) if inlist(nom_compinvest_cm1992, "D", "S") & !missing(min_imputation, max_imputation)
replace nom_compinvest_cm1992 = "" if inlist(nom_compinvest_cm1992, "D", "S")
destring nom_compinvest_cm1992, replace
drop min_imputation max_imputation comments

* Asbestos (SIC 3292) is missing from the NBER-CES post-1991; its computer investments (suppressed anyway) are dropped
drop if sic == 3292

* Total investment in the CM file must agree with the NBER-CES figure
assert abs(nom_invest_cm1992 - invest) < .001
drop invest

* Remove the six industries that are partly/wholly reclassified as non-manufacturing under NAICS
drop_sic_inds

* Sanity checks: computer investment cannot exceed total investment, and SIC 3571 and 3674 must be populated
assert nom_compinvest_cm1992 <= nom_invest_cm1992 if !missing(nom_compinvest_cm1992)
assert !missing(nom_compinvest_cm1992) if inlist(sic, 3571, 3674)

* Translate into sic87dd codes (crosswalk rows with no data are discarded)
keep sic nom_compinvest_cm1992
rename sic sic87
merge 1:1 sic87 using "../../xwalks/sic87_sic87dd.dta", assert(2 3) keep(3) nogenerate
drop sic87

* Once the fishing industry is removed, only manufacturing codes may remain
drop if sic87dd == 0900
assert inrange(sic87dd, 2000, 3999)

* Sum up to the sic87dd level. A (sum) over all-missing values comes back as zero, so count the
* populated components first and restore a missing value where there were none.
bysort sic87dd: egen num_nonmissing = total(!missing(nom_compinvest_cm1992))
bysort sic87dd: gen num_industries = _N
collapse (sum) nom_compinvest_cm1992 (mean) num_nonmissing num_industries, by(sic87dd)
assert nom_compinvest_cm1992 == 0 if num_nonmissing == 0
replace nom_compinvest_cm1992 = . if num_nonmissing == 0
drop num_nonmissing

* Deflate with the 1992 value of the PCE
gen year = 1992
merge m:1 year using "../../dta/pce/pce.dta", assert(2 3) keep(3) nogenerate

gen real_compinvest_cm1992 = nom_compinvest_cm1992/pce
keep sic87dd nom_compinvest_cm1992 real_compinvest_cm1992

tempfile comp_investments_1992
save "`comp_investments_1992'.dta", replace

/* Prepare the 2002/2007 computer investment data */

foreach y of numlist 2002 2007 {
	* Load the `y' Census of Manufactures extract downloaded from Census
	use "../../dta/cm/cm`y'.dta", clear

	keep naicsid emp rcptot cstmtot cextot cexmchc
	rename naicsid naics
	rename emp emp_cm`y'
	rename rcptot nom_vship_cm`y'
	rename cstmtot nom_matcost_cm`y'
	rename cextot nom_invest_cm`y'
	rename cexmchc nom_compinvest_cm`y'

	* Value added is rebuilt as shipments minus materials (for consistency with how the NBER-CES is cleaned)
	gen nom_vadd_cm`y' = nom_vship_cm`y' - nom_matcost_cm`y'
	drop nom_vship_cm`y' nom_matcost_cm`y'

	* The four series carried through every crosswalk step below
	local cmvars "emp_cm`y' nom_vadd_cm`y' nom_invest_cm`y' nom_compinvest_cm`y'"

	* Express employment/investments in the same units used in the NBER-CES
	foreach v of local cmvars {
		replace `v' = `v'/1000
	}

	if `y' == 2007 {
		/* Map from 2007 NAICS codes into 2002 NAICS codes */
		rename naics naics07

		* Codes absent from the crosswalk carry over unchanged with a weight of one
		joinby naics07 using "../../xwalks/naics07_naics02.dta", unmatched(master)
		replace naics02 = naics07 if _merge == 1
		replace weight = 1 if _merge == 1

		* Weights must add up to one within each 2007 industry, up to floating point error
		bysort naics07: egen weight_sum = total(weight)
		assert abs(weight_sum - 1) < .001

		* Apply the weights and add up by 2002 NAICS code
		foreach v of local cmvars {
			replace `v' = `v' * weight
		}
		collapse (sum) `cmvars', by(naics02)
	}
	else {
		* The 2002 file is already coded in 2002 NAICS
		rename naics naics02
	}

	/* Map from 2002 NAICS codes into 1997 NAICS codes */

	* Codes absent from the crosswalk carry over unchanged with a weight of one
	joinby naics02 using "../../xwalks/naics02_naics97.dta", unmatched(master)
	replace naics97 = naics02 if _merge == 1
	replace weight = 1 if _merge == 1

	* Weights must add up to one within each 2002 industry, up to floating point error
	bysort naics02: egen weight_sum = total(weight)
	assert abs(weight_sum - 1) < .001

	* Apply the weights and add up by 1997 NAICS code
	foreach v of local cmvars {
		replace `v' = `v' * weight
	}
	collapse (sum) `cmvars', by(naics97)
	rename naics97 naics

	/* Map from 1997 NAICS codes into 1987 SIC codes */
	joinby naics using "../../xwalks/naics97_sic87.dta", unmatched(both) _merge(naics_merge)

	* Every NAICS code in the data must appear in the crosswalk; crosswalk-only rows are discarded
	assert inlist(naics_merge, 2, 3)
	drop if naics_merge != 3
	drop naics_merge modified

	* Weights must add up to one within each 1997 industry, up to floating point error
	bysort naics: egen weight_sum = total(weight)
	assert abs(weight_sum - 1) < .001
	drop naics weight_sum

	* Apply the weights ahead of the collapse to SIC87
	foreach v of local cmvars {
		replace `v' = `v' * weight
	}

	* Keep only the pieces that land in manufacturing SIC codes (2000-3999)
	keep if inrange(sic, 2000, 3999)

	* Add up by SIC87 code
	collapse (sum) `cmvars', by(sic)

	* Drop the greeting card industry (which is not handled consistently pre/post NAICS)
	drop if sic == 2770

	* No "auxiliary industry" codes (20001, 30001, etc.) may be present
	assert sic <= 9999

	* Translate into sic87dd codes (crosswalk rows with no data are discarded)
	rename sic sic87
	merge 1:1 sic87 using "../../xwalks/sic87_sic87dd.dta", assert(2 3) keep(3) nogenerate
	drop sic87

	* Once the fishing industry is removed, only manufacturing codes may remain
	drop if sic87dd == 0900
	assert inrange(sic87dd, 2000, 3999)

	* Add up by sic87dd code
	collapse (sum) `cmvars', by(sic87dd)

	* Deflate computer investment with the `y' value of the PCE
	gen year = `y'
	merge m:1 year using "../../dta/pce/pce.dta", assert(2 3) keep(3) nogenerate

	gen real_compinvest_cm`y' = nom_compinvest_cm`y'/pce
	keep sic87dd `cmvars' real_compinvest_cm`y'

	tempfile comp_investments_`y'
	save "`comp_investments_`y''.dta", replace
}

/* Prepare 1988 and 1993 SMT data (used in Doms, Dunne, and Troske 1997) */

foreach y of numlist 88 93 {
	* Technology use within industries 3400-3800
	use "../../dta/smt/smt`y'.dta", clear
	assert inrange(sic, 3400, 3899)

	* Keep the employment-weighted technology measures (number of employees in plants that use each technology)
	keep sic totemp *02

	* Users of a technology may exceed total employment only by rounding error; cap them at total employment
	foreach tech of varlist *02 {
		assert `tech' < totemp + 1
		replace `tech' = totemp if `tech' > totemp
	}

	if `y' == 88 {
		* The 1988 data appear to be coded in 1977 SIC codes; the 1972-1987 SIC crosswalk is used with some modification
		* (see http://www.census.gov/epcd/www/SIC1987%20to%20SIC1977%20correspondence%20tables.pdf)
		rename sic sic72
		joinby sic72 using "../../xwalks/sic72_sic87.dta", unmatched(master) _merge(check)

		* Only these three codes may be absent from the crosswalk; each is assigned by hand with a share of one
		assert inlist(sic72, 3673, 3716, 3790) if check == 1
		replace sic87 = 3671 if sic72 == 3673
		replace sic87 = 3716 if sic72 == 3716
		replace sic87 = 3799 if sic72 == 3790
		replace sh7287 = 1 if inlist(sic72, 3673, 3716, 3790)

		* Within each 1972 SIC, the shares allocated to 1987 codes must add up to one
		bysort sic72: egen share_sum = total(sh7287)
		assert abs(share_sum - 1) < .0001
		drop share_sum

		* Allocate the counts across 1987 codes and add them up by 1987 code
		foreach count of varlist totemp *02 {
			replace `count' = `count' * sh7287
		}

		keep sic87 totemp *02
		collapse (sum) totemp *02, by(sic87)
	}
	else {
		* 1993 data are reported in 1987 SIC codes
		rename sic sic87
	}

	* Translate into sic87dd codes (crosswalk rows with no data are discarded) and add up by sic87dd
	merge 1:1 sic87 using "../../xwalks/sic87_sic87dd.dta", assert(2 3) keep(3) nogenerate
	collapse (sum) totemp *02, by(sic87dd)

	* Turn each count into the share of employees working in plants that use the technology
	foreach tech of varlist *02 {
		gen share_`tech' = `tech'/totemp
		drop `tech'
	}

	* There must be exactly 17 technologies
	quietly lookfor share
	local num_techs : word count `r(varlist)'
	assert `num_techs' == 17

	* Percentage of the 17 technologies to which the average worker is exposed in each industry
	gen smtshare_19`y' = 0
	foreach sh of varlist share* {
		replace smtshare_19`y' = smtshare_19`y' + 100 * (`sh'/17)
	}

	* Restrict the technology variable to industries in SIC 34-38
	keep if inrange(sic87dd, 3400, 3899)
	assert inrange(smtshare_19`y', 0, 100)

	keep sic87dd smtshare_19`y'
	tempfile smt`y'
	save "`smt`y''.dta", replace
}

/* Combine files to finalize the dataset */

* The industry-by-year core data are the base
use "`core'.dta", clear

* 1977/1982/1987 and 1992 computer investments: every industry must match on both sides
merge m:1 sic87dd using "`comp_investments_778287'.dta", assert(3) nogenerate
merge m:1 sic87dd using "`comp_investments_1992'.dta", assert(3) nogenerate

* 2002/2007 computer investments: no industry may appear only in the CM file
foreach y of numlist 2002 2007 {
	merge m:1 sic87dd using "`comp_investments_`y''.dta", assert(1 3)

	if `y' == 2002 {
		* Full coverage is required in 2002
		assert _merge == 3
	}
	else {
		* In 2007 only industry 3142 may be unmatched; it is imputed below
		assert sic87dd == 3142 if _merge == 1

		* Build the imputation from industry 3142's own 2007 row: zero computer investment,
		* with employment, investment, and value added taken from the core series
		preserve
		keep if sic87dd == 3142 & year == 2007
		gen nom_compinvest_cm2007_imp = 0
		gen real_compinvest_cm2007_imp = 0
		gen emp_cm2007_imp = emp
		gen nom_invest_cm2007_imp = nom_invest
		gen nom_vadd_cm2007_imp = nom_vship - nom_matcost
		keep sic87dd *imp
		tempfile imputations
		save "`imputations'.dta", replace
		restore

		* Copy the imputed values onto every row of industry 3142, then discard the helper columns
		merge m:1 sic87dd using "`imputations'.dta", assert(1 3) nogenerate
		foreach v of varlist nom_compinvest_cm2007 real_compinvest_cm2007 emp_cm2007 nom_invest_cm2007 nom_vadd_cm2007 {
			replace `v' = `v'_imp if sic87dd == 3142
			drop `v'_imp
		}
	}

	drop _merge
}

* 1988/1993 SMT technology usage: matched exactly for industries 3400-3899 and unmatched everywhere else
foreach y of numlist 88 93 {
	merge m:1 sic87dd using "`smt`y''.dta", assert(1 3)
	assert _merge == cond(inrange(sic87dd, 3400, 3899), 3, 1)
	drop _merge
}

* Recover each price index as the ratio of the nominal series to the real series
local stems   "vship matcost invest energy"
local indices "piship pimat piinv pien"
forvalues k = 1/4 {
	local stem : word `k' of `stems'
	local pidx : word `k' of `indices'
	gen `pidx' = nom_`stem'/real_`stem'
}

* In 2007 every index must equal one up to rounding; then set it to exactly one
foreach pidx of local indices {
	assert abs(`pidx' - 1) < .0001 if year == 2007
	replace `pidx' = 1 if year == 2007
}

* Nominal value added
gen nom_vadd = nom_vship - nom_matcost

* Laspeyres and Paasche versions of real value added (using both 1991 and 2007 as the base year)
foreach pidx in piship pimat {
	assert `pidx' == 1 if year == 2007
}

foreach base of numlist 1991 2007 {
	* Spread each industry's base-year index value across all of its years
	bysort sic87dd: egen piship`base' = total(piship * (year == `base'))
	bysort sic87dd: egen pimat`base' = total(pimat * (year == `base'))

	* Deflate with the index rebased to the chosen year
	gen real_vship_`base'b = nom_vship/(piship/piship`base')
	gen real_matcost_`base'b = nom_matcost/(pimat/pimat`base')
	gen real_vadd_`base'b = real_vship_`base'b - real_matcost_`base'b
}

* With 2007 as the base year, the result must coincide with real_vship - real_matcost
assert abs(real_vadd_2007b - (real_vship - real_matcost)) < 1

* Only the two real value added series are kept from this step
drop piship1991 pimat1991 real_vship_1991b real_matcost_1991b
drop piship2007 pimat2007 real_vship_2007b real_matcost_2007b

/* Compute 4-factor TFP */

local factors "nonprod prod mat cap"

* Cost shares of shipments; the max()/min() terms keep each share between zero and one
gen nonprod_share = max(0, (nom_pay - nom_prodpay)/nom_vship)
gen prod_share = nom_prodpay/nom_vship
gen mat_share = min(nom_matcost, nom_vship - nom_pay)/nom_vship
gen cap_share = max(0, 1 - prod_share - nonprod_share - mat_share)

foreach f of local factors {
	assert inrange(`f'_share, 0, 1)
}

assert abs(nonprod_share + prod_share + mat_share + cap_share - 1) < .001

* Factor weights: the average of this year's and the previous year's cost share
foreach f of local factors {
	bysort sic87dd (year): gen `f'_share_lag = `f'_share[_n - 1]
	gen alpha_`f' = (`f'_share + `f'_share_lag)/2
}

* No previous year exists in 1963, so the weights are only checked afterwards
assert abs(alpha_nonprod + alpha_prod + alpha_mat + alpha_cap - 1) < .001 if year > 1963

* Quantities of each factor
gen q_nonprod = emp - prodemp
gen q_prod = prodhrs
gen q_mat = nom_matcost/pimat
gen q_cap = nom_cap

* Year-over-year log change in each factor quantity
foreach f of local factors {
	bysort sic87dd (year): gen q_`f'_lag = q_`f'[_n - 1]
	gen dlog_`f' = log(q_`f') - log(q_`f'_lag)
}

* Year-over-year log change in real shipments
bysort sic87dd (year): gen real_vship_lag = real_vship[_n - 1]
gen dlog_vship = log(real_vship) - log(real_vship_lag)

* Change in 4-factor TFP: output growth net of share-weighted input growth
gen dtfp = dlog_vship - (alpha_nonprod * dlog_nonprod) - (alpha_prod * dlog_prod) - (alpha_mat * dlog_mat) - (alpha_cap * dlog_cap)

* Level of 4-factor TFP: cumulate the changes within industry, then shift so that 1997 equals one
bysort sic87dd (year): gen sum_dtfp = sum(dtfp)
bysort sic87dd (year): egen base = total(sum_dtfp * (year == 1997))
replace sum_dtfp = sum_dtfp - base
gen tfp = exp(sum_dtfp)
assert tfp == 1 if year == 1997

* Attach 1-digit sectors (sector rows with no data are discarded)
merge m:1 sic87dd using "../../xwalks/sectors.dta", assert(2 3) keep(3) nogenerate

* Baseline computerization measures for 1977/1982/1987
foreach yr in 1977 1982 1987 {
	* Computer investment as a share of all investment, using the industry's total investment in that year
	bysort sic87dd: egen nom_invest`yr' = total(nom_invest * (year == `yr'))
	gen ci_per_invest_`yr' = nom_compinvest`yr'/nom_invest`yr'

	* Real computer investment per worker, using the industry's employment in that year
	bysort sic87dd: egen emp`yr' = total(emp * (year == `yr'))
	gen ci_per_emp_`yr' = real_compinvest`yr'/emp`yr'
}

* Calculate computerization rates in 1992

* Computer investment as a share of all investment, using the industry's total investment in 1992
bysort sic87dd: egen nom_invest1992 = total(nom_invest * (year == 1992))
gen ci_per_invest_1992 = nom_compinvest_cm1992/nom_invest1992

* Real computer investment per worker, using the industry's employment in 1992.
* The numerator is the PCE-deflated series, matching the 1977/1982/1987 and 2002/2007 measures and
* this variable's own label ("Real Computer Investments/Worker ..."); it was previously the nominal series.
bysort sic87dd: egen emp1992 = total(emp * (year == 1992))
gen ci_per_emp_1992 = real_compinvest_cm1992/emp1992

* Calculate computerization rates in 2002/2007 (note: take denominators straight from the CM, not from the NBER-CES)
foreach y of numlist 2002 2007 {
	gen ci_per_invest_`y' = nom_compinvest_cm`y'/nom_invest_cm`y'
	gen ci_per_emp_`y' = real_compinvest_cm`y'/emp_cm`y'
}

/* Provide several definitions of the "computer sector" */

* Narrowest: computers themselves plus semiconductors
gen comp_narrowest = inlist(sic87dd, 3571, 3674)

* Narrow: select computer-related industries (NB: sic87dd code 3577 includes SIC code 3575)
gen comp_narrow = inlist(sic87dd, 3571, 3572, 3577, 3674)

* Medium: the 357x, 366x, and 367x industry groups
gen comp_medium = inrange(sic87dd, 3570, 3579) | inrange(sic87dd, 3660, 3669) | inrange(sic87dd, 3670, 3679)

* Broad: an attempt to match the Houseman et al. definition (based on NAICS sector 334---)
gen comp_broad = 0
replace comp_broad = 1 if inrange(sic87dd, 3570, 3579) & sic87dd != 3579
replace comp_broad = 1 if inlist(sic87dd, 3651, 3652)
replace comp_broad = 1 if inrange(sic87dd, 3660, 3669)
replace comp_broad = 1 if inrange(sic87dd, 3670, 3679)
replace comp_broad = 1 if sic87dd == 3695
replace comp_broad = 1 if inlist(sic87dd, 3812, 3822, 3823, 3824, 3825, 3826, 3829, 3844, 3845, 3873)

* The definitions must be nested (except that "broad" does not nest "medium")
assert comp_narrow if comp_narrowest
foreach wider in comp_medium comp_broad {
	assert `wider' if comp_narrow
}

* Retain only the variables that belong in the cleaned dataset
keep sic87dd year sector* comp_* ///
	emp prodemp prodhrs ///
	nom_pay nom_prodpay nom_vship nom_vadd nom_matcost nom_invest nom_energy nom_cap ///
	real_pay real_prodpay real_vship real_vadd_* real_matcost real_invest real_energy ///
	nom_compinvest* real_compinvest* emp_cm* nom_vadd_cm* nom_invest_cm* ///
	ci_per_invest_* ci_per_emp_* smtshare_1988 smtshare_1993 ///
	piship pimat piinv pien dtfp tfp

* Attach industry descriptions (descriptions with no data are discarded)
merge m:1 sic87dd using "../../xwalks/sic87dd_key.dta", assert(2 3) keep(3) nogenerate

* Label variables

* Employment and hours
label var emp "Employment (000s)"
label var prodemp "Production Employment (000s)"
label var prodhrs "Production Workers Hours (mil)"

* Series that exist in both nominal and real form share one description
local desc_pay     "Payroll"
local desc_prodpay "Production Workers Payroll"
local desc_vship   "Value of Shipments"
local desc_matcost "Materials Costs"
local desc_invest  "Investments"
local desc_energy  "Energy Purchases"
foreach s in pay prodpay vship matcost invest energy {
	label var nom_`s' "Nominal `desc_`s'' (\$mil)"
	label var real_`s' "Real `desc_`s'' (mil 2007\$)"
}
label var nom_cap "Nominal Capital Stock (\$mil)"

* Price indices
label var piship "Shipments Deflator (= 1 in 2007)"
label var pimat "Materials Deflator (= 1 in 2007)"
label var piinv "Investment Deflator (= 1 in 2007)"
label var pien "Energy Deflator (= 1 in 2007)"

* Value added
label var nom_vadd "Nominal Value Added (\$mil)"
label var real_vadd_1991b "Real Value Added (mil 1991\$)"
label var real_vadd_2007b "Real Value Added (mil 2007\$)"

* Productivity
label var dtfp "Change in 4-Factor TFP"
label var tfp "4-Factor TFP"

* Computer investments in 1977/1982/1987
foreach y of numlist 1977 1982 1987 {
	label var nom_compinvest`y' "Nominal Computer Investments in `y' (\$mil)"
	label var real_compinvest`y' "Real Computer Investments in `y' (mil 2007\$, via PCE)"
}

* Computerization rates: the wording is the same for every year
foreach y of numlist 1977 1982 1987 1992 2002 2007 {
	label var ci_per_invest_`y' "Computer Investments/Total Investments in `y'"
	label var ci_per_emp_`y' "Real Computer Investments/Worker in `y' (000s of 2007\$, via PCE)"
}

* Census of Manufactures computer investments
foreach y of numlist 1992 2002 2007 {
	label var nom_compinvest_cm`y' "Nominal Computer Investments in the `y' CM Data (\$mil)"
}
label var real_compinvest_cm1992 "Real Computer Investments in 1992 (mil 2007\$, via PCE)"

* Other Census of Manufactures series (2002/2007 only)
foreach y of numlist 2002 2007 {
	label var emp_cm`y' "Employment in the `y' CM Data (000s)"
	label var nom_invest_cm`y' "Nominal Investments in the `y' CM Data (\$mil)"
	label var nom_vadd_cm`y' "Nominal Value Added in the `y' CM Data (\$mil)"
	label var real_compinvest_cm`y' "Real Computer Investments in the `y' CM Data (mil 2007\$)"
}

* Computer sector indicators
label var comp_narrowest "Computer Sector: Narrowest Definition"
label var comp_narrow "Computer Sector: Narrow Definition"
label var comp_medium "Computer Sector: Medium Definition"
label var comp_broad "Computer Sector: Broad Definition (Houseman et al.)"

* Write the cleaned dataset to disk
save "../../dta/nber-ces/clean/nber-ces-clean.dta", replace

/* Create a spreadsheet indicating which industries comprise the computer sector under each definition */

* One row per industry (the 1991 observation), holding its code, description, employment, and sector flags
use "../../dta/nber-ces/clean/nber-ces-clean.dta", clear
keep if year == 1991
keep sic87dd sic87dd_desc emp comp_*
order sic87dd sic87dd_desc emp comp_narrowest comp_narrow comp_medium comp_broad

* Short labels: these become the column headers because of firstrow(varlabels)
label var sic87dd        "4-Digit Ind (sic87dd)"
label var sic87dd_desc   "Ind Description"
label var emp            "1991 Emp (000s)"
label var comp_narrowest "Narrowest (3571, 3674)"
label var comp_narrow    "Narrow (3571, 3572, 3577, 3674)"
label var comp_medium    "Medium (3570s, 3660s, 3670s)"
label var comp_broad     "Broad (Houseman et al.)"

* Overwrite the "Raw" sheet of the workbook
export excel using "../../out/auxiliary/Computer-Sector-Definitions.xlsx", sheet("Raw") sheetreplace firstrow(varlabels)




