Code
source(here::here("r", "libraries.r"))
library(jsonlite)
library(tidyjson)source(here::here("r", "libraries.r"))
library(jsonlite)
library(tidyjson)tddir <- r"(E:\data\taxdata-psl)"
tcvars <- readRDS(here::here("data", "tcvars.rds"))
# ht(tcvars)utcvars <- tcvars |>
select(vname, desc) |>
distinct() |>
add_case(vname=c("n", "nret"), desc=c("# records", "# returns (millions)"))
sdf <- readRDS(path(tddir, "scratch", "tcoutput.rds")) # created in prelims.qmd
# count(sdf, src)sdf |>
summarise(n=n(),
nret=sum(s006),
across(
c(c00100, e00200, e00300, e00400, e00600, e00650, e00650, e00800, e00900, e01100, e01400,
e01500, e01700, e02100, e02300, e02400,
e03150, e03210, e03270, e03300, e17500, e18400, e32800,
c05800, taxbc, othertaxes, iitax, payrolltax),
~sum(.x * s006)), .by=src) |>
pivot_longer(-src, names_to = "vname") |>
pivot_wider(names_from = src) |>
mutate(pemtd=pe - td, pdiff=pemtd / td, rn=row_number()) |>
left_join(utcvars, by = join_by(vname)) |>
gt() |>
cols_hide(rn) |>
tab_header(
title = html("Comparison of weighted sums, selected variables, Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall)"),
subtitle = html("Tax year 2023, dollar amounts in billions")
) |>
cols_label(tdall=html("taxdata (tdall) - all variables"),
td=html("taxdata (td)"),
pe=html("Policy Engine (pe)"),
pemtd=html("Policy Engine minus taxdata (pemtd)"),
pdiff=html("Policy Engine % difference from taxdata")) |>
fmt_number(columns=c(tdall, td, pe, pemtd),
rows=1,
decimals=0) |>
fmt_number(columns=c(tdall, td, pe, pemtd),
rows=2,
scale=1e-6,
decimals=2) |>
fmt_number(columns=c(tdall, td, pe, pemtd),
rows=rn > 2,
scale=1e-9,
decimals=2) |>
fmt_percent(columns=pdiff,
decimals=1)| Comparison of weighted sums, selected variables, Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall) | ||||||
|---|---|---|---|---|---|---|
| Tax year 2023, dollar amounts in billions | ||||||
| vname | taxdata (td) | Policy Engine (pe) | taxdata (tdall) - all variables | Policy Engine minus taxdata (pemtd) | Policy Engine % difference from taxdata | desc |
| n | 252,868 | 155,312 | 252,868 | −97,556 | −38.6% | # records |
| nret | 208.42 | 189.54 | 208.42 | −18.88 | −9.1% | # returns (millions) |
| c00100 | 14,680.95 | 15,820.46 | 15,655.28 | 1,139.51 | 7.8% | Adjusted Gross Income (AGI) |
| e00200 | 10,310.66 | 10,928.44 | 10,310.66 | 617.78 | 6.0% | Wages, salaries, and tips for filing unit net of pension contributions |
| e00300 | 137.18 | 213.66 | 137.18 | 76.47 | 55.7% | Taxable interest income |
| e00400 | 94.74 | 201.72 | 94.74 | 106.98 | 112.9% | Tax-exempt interest income |
| e00600 | 405.99 | 504.30 | 405.99 | 98.31 | 24.2% | Ordinary dividends included in AGI |
| e00650 | 285.10 | 324.38 | 285.10 | 39.28 | 13.8% | Qualified dividends included in ordinary dividends |
| e00800 | 19.39 | 2.64 | 19.39 | −16.75 | −86.4% | Alimony received |
| e00900 | 429.54 | 573.91 | 429.54 | 144.37 | 33.6% | Sch C business net profit/loss for filing unit |
| e01100 | 7.29 | 0.00 | 7.29 | −7.29 | −100.0% | Capital gain distributions not reported on Sch D |
| e01400 | 552.47 | 0.00 | 552.47 | −552.47 | −100.0% | Taxable IRA distributions |
| e01500 | 1,641.19 | 1,046.33 | 1,641.19 | −594.86 | −36.2% | Total pensions and annuities |
| e01700 | 1,000.55 | 1,046.33 | 1,000.55 | 45.78 | 4.6% | Taxable pensions and annuities |
| e02100 | −6.52 | 66.80 | −6.52 | 73.32 | −1,124.0% | Farm net income/loss for filing unit from Sch F |
| e02300 | 23.26 | 17.17 | 23.26 | −6.09 | −26.2% | Unemployment insurance benefits |
| e02400 | 1,491.42 | 1,235.03 | 1,491.42 | −256.39 | −17.2% | Total social security (OASDI) benefits |
| e03150 | 18.52 | 0.00 | 18.52 | −18.52 | −100.0% | Total deductible IRA contributions |
| e03210 | 15.54 | 0.00 | 15.54 | −15.54 | −100.0% | Student loan interest |
| e03270 | 40.91 | 0.00 | 40.91 | −40.91 | −100.0% | Self-employed health insurance deduction |
| e03300 | 0.00 | 0.00 | 30.20 | 0.00 | NaN | Contributions to SEP, SIMPLE and qualified plans |
| e17500 | 235.04 | 993.68 | 235.04 | 758.64 | 322.8% | Itemizable medical and dental expenses. WARNING: this variable is zero below the floor in PUF data. |
| e18400 | 508.10 | 170.44 | 508.10 | −337.66 | −66.5% | Itemizable state and local income/sales taxes |
| e32800 | 29.85 | 22.75 | 29.85 | −7.10 | −23.8% | Child/dependent-care expenses for qualifying persons from Form 2441 |
| c05800 | 2,104.34 | 2,125.77 | 2,353.77 | 21.43 | 1.0% | Total (regular + AMT) income tax liability before credits (equals taxbc plus c09600) |
| taxbc | 2,101.89 | 2,122.47 | 2,323.66 | 20.58 | 1.0% | Regular tax on regular taxable income before credits |
| othertaxes | 52.58 | 69.25 | 67.33 | 16.67 | 31.7% | Other taxes: sum of niit, e09700, e09800 and e09900 (included in c09200) |
| iitax | 1,949.49 | 2,012.90 | 2,154.34 | 63.41 | 3.3% | Total federal individual income tax liability; appears as INCTAX variable in tc CLI minimal output |
| payrolltax | 1,478.31 | 1,696.71 | 1,482.10 | 218.40 | 14.8% | Total (employee + employer) payroll tax liability; appears as PAYTAX variable in tc CLI minimal output (payrolltax = ptax_was + setax + ptax_amc) |
# #| page-layout: full
# #| column: screen
ycuts <- c(-Inf, -1e-99, 0, 25e3, 50e3, 100e3, 200e3, 500e3, 1e6, Inf)
sdf2 <- sdf |>
mutate(agirange=cut(c00100, ycuts))
# count(sdf2, agirange)
tabdata <- sdf2 |>
summarise(n=n(), nret=sum(s006), taxbc=sum(taxbc * s006), .by=c(src, agirange)) |>
pivot_longer(cols= -c(src, agirange)) |>
arrange(src, name, agirange) |>
pivot_wider(names_from = c(name, src)) |>
select(agirange, starts_with(c("n_", "nret_", "taxbc_")), everything()) |>
arrange(agirange) |>
janitor::adorn_totals() |>
mutate(avgtaxbc_pe=taxbc_pe / nret_pe, avgtaxbc_td=taxbc_td / nret_td)
tabdata |>
gt() |>
tab_header(
title = html("Comparison by AGI range of Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall)"),
subtitle = html("Tax year 2023")
) |>
tab_spanner(columns = starts_with("n_"),
label="# of records") |>
tab_spanner(columns = starts_with("nret_"),
label=html("# of returns<br>(millions)")) |>
tab_spanner(columns = starts_with("taxbc_"),
label=html("tax before credits<br>($ billions)")) |>
tab_spanner(columns = starts_with("avgtaxbc_"),
label=html("average tax before credits<br>($ dollars)")) |>
fmt_number(columns=starts_with("n_"),
decimals=0) |>
fmt_number(columns=starts_with("nret_"),
scale=1e-6,
decimals=1) |>
fmt_number(columns=starts_with("taxbc_"),
scale=1e-9,
decimals=1) |>
fmt_number(columns=starts_with("avgtaxbc_"),
scale=1,
decimals=0) | Comparison by AGI range of Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall) | |||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|
| Tax year 2023 | |||||||||||
| agirange | # of records | # of returns (millions) |
tax before credits ($ billions) |
average tax before credits ($ dollars) |
|||||||
| n_pe | n_td | n_tdall | nret_pe | nret_td | nret_tdall | taxbc_pe | taxbc_td | taxbc_tdall | avgtaxbc_pe | avgtaxbc_td | |
| (-Inf,-1e-99] | 218 | 3,362 | 4,620 | 0.2 | 1.6 | 2.0 | 0.0 | 0.0 | 0.0 | 0 | 0 |
| (-1e-99,0] | 12,868 | 15,365 | 14,836 | 15.7 | 33.8 | 33.2 | 0.0 | 0.0 | 0.0 | 0 | 0 |
| (0,2.5e+04] | 25,219 | 52,574 | 50,632 | 47.4 | 62.4 | 61.1 | 6.4 | 5.4 | 5.4 | 134 | 86 |
| (2.5e+04,5e+04] | 27,612 | 36,602 | 35,718 | 39.3 | 30.2 | 30.0 | 74.9 | 55.0 | 54.4 | 1,904 | 1,817 |
| (5e+04,1e+05] | 37,759 | 44,821 | 43,655 | 44.1 | 38.3 | 38.4 | 277.8 | 235.7 | 232.5 | 6,293 | 6,145 |
| (1e+05,2e+05] | 19,874 | 37,925 | 36,546 | 30.5 | 28.8 | 29.3 | 588.1 | 496.8 | 490.3 | 19,312 | 17,272 |
| (2e+05,5e+05] | 15,454 | 23,228 | 20,648 | 10.4 | 11.2 | 11.7 | 567.1 | 558.5 | 569.1 | 54,548 | 50,067 |
| (5e+05,1e+06] | 5,055 | 12,946 | 12,549 | 1.4 | 1.5 | 1.8 | 195.9 | 237.5 | 276.0 | 142,159 | 160,170 |
| (1e+06, Inf] | 11,253 | 26,045 | 33,664 | 0.5 | 0.7 | 0.9 | 412.3 | 513.1 | 695.9 | 796,668 | 767,490 |
| Total | 155,312 | 252,868 | 252,868 | 189.5 | 208.4 | 208.4 | 2,122.5 | 2,101.9 | 2,323.7 | 11,198 | 10,085 |
# #| column: screen
# count(sdf, MARS)
# [1=single, 2=joint, 3=separate, 4=household-head, 5=widow(er)
mlabs <- c(single=1, joint=2, separate=3, "head of household"=4, "widow(er)"=5)
# mlabs
sdf2 <- sdf |>
mutate(mars2=factor(MARS, levels=mlabs, labels=names(mlabs)))
# count(sdf2, MARS, mars2)
tabdata <- sdf2 |>
summarise(n=n(), nret=sum(s006), taxbc=sum(taxbc * s006), .by=c(src, mars2)) |>
pivot_longer(cols= -c(src, mars2)) |>
arrange(src, name, mars2) |>
pivot_wider(names_from = c(name, src)) |>
select(mars2, starts_with(c("n_", "nret_", "taxbc_")), everything()) |>
arrange(mars2) |>
janitor::adorn_totals() |>
mutate(avgtaxbc_pe=taxbc_pe / nret_pe, avgtaxbc_td=taxbc_td / nret_td)
tabdata |>
gt() |>
tab_header(
title = html("Comparison by marital status of Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall)"),
subtitle = html("Tax year 2023")
) |>
tab_spanner(columns = starts_with("n_"),
label="# of records") |>
tab_spanner(columns = starts_with("nret_"),
label=html("# of returns<br>(millions)")) |>
tab_spanner(columns = starts_with("taxbc_"),
label=html("tax before credits<br>($ billions)")) |>
tab_spanner(columns = starts_with("avgtaxbc_"),
label=html("average tax before credits<br>($ dollars)")) |>
fmt_number(columns=starts_with("n_"),
decimals=0) |>
fmt_number(columns=starts_with("nret_"),
scale=1e-6,
decimals=1) |>
fmt_number(columns=starts_with("taxbc_"),
scale=1e-9,
decimals=1) |>
fmt_number(columns=starts_with("avgtaxbc_"),
scale=1,
decimals=0) | Comparison by marital status of Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall) | |||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|
| Tax year 2023 | |||||||||||
| mars2 | # of records | # of returns (millions) |
tax before credits ($ billions) |
average tax before credits ($ dollars) |
|||||||
| n_pe | n_td | n_tdall | nret_pe | nret_td | nret_tdall | taxbc_pe | taxbc_td | taxbc_tdall | avgtaxbc_pe | avgtaxbc_td | |
| single | 69,876 | 104,801 | 104,801 | 87.8 | 114.5 | 114.5 | 493.8 | 491.2 | 515.7 | 5,627 | 4,290 |
| joint | 58,522 | 119,177 | 119,177 | 67.0 | 66.6 | 66.6 | 1,489.0 | 1,456.8 | 1,642.9 | 22,230 | 21,860 |
| separate | 2,438 | 4,711 | 4,711 | 4.4 | 3.7 | 3.7 | 25.9 | 38.4 | 44.9 | 5,826 | 10,360 |
| head of household | 12,796 | 24,179 | 24,179 | 15.0 | 23.6 | 23.6 | 53.4 | 115.6 | 120.2 | 3,565 | 4,898 |
| widow(er) | 11,680 | NA | NA | 15.4 | NA | NA | 60.3 | NA | NA | 3,925 | NA |
| Total | 155,312 | 252,868 | 252,868 | 189.5 | 208.4 | 208.4 | 2,122.5 | 2,101.9 | 2,323.7 | 11,198 | 10,085 |
library(corrr)
# get top 10 variables
# ns(sdf)
topvars <- sdf |>
filter(src=="pe") |>
select(s006, starts_with("e", ignore.case=FALSE)) |>
summarise(nret=sum(s006), across(-s006, ~ sum(.x * s006))) |>
pivot_longer(-nret) |>
filter(str_sub(name, 2, 2) %in% 0:9,
str_sub(name, -1, -1) %in% 0:9) |>
arrange(desc(value)) |>
select(-nret) |>
filter(value > 0)
vars <- topvars$name[1:10]
f <- function(df){
correlate(df) |>
shave() |>
rename(var1=term) |>
pivot_longer(-var1, names_to = "var2", values_to = "corr") |>
filter(!is.na(corr))
}
cordf <- sdf |>
filter(src %in% c("td", "pe")) |>
select(src, any_of(vars)) |>
reframe(f(pick(everything())), .by=src) |>
pivot_wider(names_from = src, values_from = corr) |>
mutate(diff=pe - td) |>
arrange(desc(abs(diff))) |>
left_join(utcvars |> rename(var1=vname, desc1=desc), by = join_by(var1)) |>
left_join(utcvars |> rename(var2=vname, desc2=desc), by = join_by(var2))cordf |>
# filter(row_number() <= 10) |>
filter(abs(diff) >= 0.1) |>
gt() |>
tab_header(
title = html("Correlation pairs with absolute difference >= 0.10, taxdata (td) and Policy Engine (pe)"),
subtitle = html("Tax year 2023")
) |>
fmt_number(columns=c(td, pe, diff),
decimals=3)| Correlation pairs with absolute difference >= 0.10, taxdata (td) and Policy Engine (pe) | ||||||
|---|---|---|---|---|---|---|
| Tax year 2023 | ||||||
| var1 | var2 | td | pe | diff | desc1 | desc2 |
| e01700 | e01500 | 0.243 | 1.000 | 0.757 | Taxable pensions and annuities | Total pensions and annuities |
| e00400 | e00300 | 0.222 | −0.062 | −0.284 | Tax-exempt interest income | Taxable interest income |
| e01500 | e02400 | 0.096 | 0.266 | 0.171 | Total pensions and annuities | Total social security (OASDI) benefits |
| e17500 | e02400 | 0.123 | −0.001 | −0.125 | Itemizable medical and dental expenses. WARNING: this variable is zero below the floor in PUF data. | Total social security (OASDI) benefits |
| e00600 | e00200 | 0.093 | 0.197 | 0.103 | Ordinary dividends included in AGI | Wages, salaries, and tips for filing unit net of pension contributions |