3  Analysis

Code
source(here::here("r", "libraries.r"))
library(jsonlite)
library(tidyjson)
Code
tddir <- r"(E:\data\taxdata-psl)"

tcvars <- readRDS(here::here("data", "tcvars.rds"))
# ht(tcvars)
Code
utcvars <- tcvars |> 
  select(vname, desc) |> 
  distinct() |> 
  add_case(vname=c("n", "nret"), desc=c("# records", "# returns (millions)"))

sdf <- readRDS(path(tddir, "scratch", "tcoutput.rds")) # created in prelims.qmd
# count(sdf, src)

3.1 Comparison of weighted sums for selected variables

Code
sdf |> 
  summarise(n=n(), 
            nret=sum(s006), 
            across(
              c(c00100, e00200, e00300, e00400, e00600, e00650, e00650, e00800, e00900, e01100, e01400, 
                e01500, e01700, e02100, e02300, e02400,
                e03150, e03210, e03270, e03300, e17500, e18400, e32800,
                c05800, taxbc, othertaxes, iitax, payrolltax),
              ~sum(.x * s006)), .by=src) |> 
  pivot_longer(-src, names_to = "vname") |> 
  pivot_wider(names_from = src) |> 
  mutate(pemtd=pe - td, pdiff=pemtd / td, rn=row_number()) |> 
  left_join(utcvars, by = join_by(vname)) |> 
  gt() |> 
  cols_hide(rn) |> 
  tab_header(
    title = html("Comparison of weighted sums, selected variables, Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall)"),
    subtitle = html("Tax year 2023, dollar amounts in billions")
  ) |>  
  cols_label(tdall=html("taxdata (tdall) - all variables"),
             td=html("taxdata (td)"),
             pe=html("Policy Engine (pe)"),
             pemtd=html("Policy Engine minus taxdata (pemtd)"),
             pdiff=html("Policy Engine % difference from taxdata")) |> 
  fmt_number(columns=c(tdall, td, pe, pemtd),
             rows=1,
             decimals=0) |> 
  fmt_number(columns=c(tdall, td, pe, pemtd),
             rows=2,
             scale=1e-6,
             decimals=2) |> 
  fmt_number(columns=c(tdall, td, pe, pemtd),
             rows=rn > 2,
             scale=1e-9,
             decimals=2) |> 
  fmt_percent(columns=pdiff,
              decimals=1)
Comparison of weighted sums, selected variables, Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall)
Tax year 2023, dollar amounts in billions
vname taxdata (td) Policy Engine (pe) taxdata (tdall) - all variables Policy Engine minus taxdata (pemtd) Policy Engine % difference from taxdata desc
n 252,868 155,312 252,868 −97,556 −38.6% # records
nret 208.42 189.54 208.42 −18.88 −9.1% # returns (millions)
c00100 14,680.95 15,820.46 15,655.28 1,139.51 7.8% Adjusted Gross Income (AGI)
e00200 10,310.66 10,928.44 10,310.66 617.78 6.0% Wages, salaries, and tips for filing unit net of pension contributions
e00300 137.18 213.66 137.18 76.47 55.7% Taxable interest income
e00400 94.74 201.72 94.74 106.98 112.9% Tax-exempt interest income
e00600 405.99 504.30 405.99 98.31 24.2% Ordinary dividends included in AGI
e00650 285.10 324.38 285.10 39.28 13.8% Qualified dividends included in ordinary dividends
e00800 19.39 2.64 19.39 −16.75 −86.4% Alimony received
e00900 429.54 573.91 429.54 144.37 33.6% Sch C business net profit/loss for filing unit
e01100 7.29 0.00 7.29 −7.29 −100.0% Capital gain distributions not reported on Sch D
e01400 552.47 0.00 552.47 −552.47 −100.0% Taxable IRA distributions
e01500 1,641.19 1,046.33 1,641.19 −594.86 −36.2% Total pensions and annuities
e01700 1,000.55 1,046.33 1,000.55 45.78 4.6% Taxable pensions and annuities
e02100 −6.52 66.80 −6.52 73.32 −1,124.0% Farm net income/loss for filing unit from Sch F
e02300 23.26 17.17 23.26 −6.09 −26.2% Unemployment insurance benefits
e02400 1,491.42 1,235.03 1,491.42 −256.39 −17.2% Total social security (OASDI) benefits
e03150 18.52 0.00 18.52 −18.52 −100.0% Total deductible IRA contributions
e03210 15.54 0.00 15.54 −15.54 −100.0% Student loan interest
e03270 40.91 0.00 40.91 −40.91 −100.0% Self-employed health insurance deduction
e03300 0.00 0.00 30.20 0.00 NaN Contributions to SEP, SIMPLE and qualified plans
e17500 235.04 993.68 235.04 758.64 322.8% Itemizable medical and dental expenses. WARNING: this variable is zero below the floor in PUF data.
e18400 508.10 170.44 508.10 −337.66 −66.5% Itemizable state and local income/sales taxes
e32800 29.85 22.75 29.85 −7.10 −23.8% Child/dependent-care expenses for qualifying persons from Form 2441
c05800 2,104.34 2,125.77 2,353.77 21.43 1.0% Total (regular + AMT) income tax liability before credits (equals taxbc plus c09600)
taxbc 2,101.89 2,122.47 2,323.66 20.58 1.0% Regular tax on regular taxable income before credits
othertaxes 52.58 69.25 67.33 16.67 31.7% Other taxes: sum of niit, e09700, e09800 and e09900 (included in c09200)
iitax 1,949.49 2,012.90 2,154.34 63.41 3.3% Total federal individual income tax liability; appears as INCTAX variable in tc CLI minimal output
payrolltax 1,478.31 1,696.71 1,482.10 218.40 14.8% Total (employee + employer) payroll tax liability; appears as PAYTAX variable in tc CLI minimal output (payrolltax = ptax_was + setax + ptax_amc)

3.2 Comparison of weighted sums by AGI range for selected variables

Code
# #| page-layout: full
# #| column: screen

ycuts <- c(-Inf, -1e-99, 0, 25e3, 50e3, 100e3, 200e3, 500e3, 1e6, Inf)
sdf2 <- sdf |> 
  mutate(agirange=cut(c00100, ycuts))
# count(sdf2, agirange)
  
tabdata <- sdf2 |> 
  summarise(n=n(), nret=sum(s006), taxbc=sum(taxbc * s006), .by=c(src, agirange)) |> 
  pivot_longer(cols= -c(src, agirange)) |> 
  arrange(src, name, agirange) |> 
  pivot_wider(names_from = c(name, src)) |> 
  select(agirange, starts_with(c("n_", "nret_", "taxbc_")), everything()) |> 
  arrange(agirange) |> 
  janitor::adorn_totals() |> 
  mutate(avgtaxbc_pe=taxbc_pe / nret_pe, avgtaxbc_td=taxbc_td / nret_td)

tabdata |> 
  gt() |> 
  tab_header(
    title = html("Comparison by AGI range of Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall)"),
    subtitle = html("Tax year 2023")
  ) |>
  tab_spanner(columns = starts_with("n_"),
              label="# of records") |> 
  tab_spanner(columns = starts_with("nret_"),
              label=html("# of returns<br>(millions)")) |> 
  tab_spanner(columns = starts_with("taxbc_"),
              label=html("tax before credits<br>($ billions)")) |>   
  tab_spanner(columns = starts_with("avgtaxbc_"),
              label=html("average tax before credits<br>($ dollars)")) |> 
  fmt_number(columns=starts_with("n_"),
             decimals=0) |> 
  fmt_number(columns=starts_with("nret_"),
             scale=1e-6,
             decimals=1) |> 
  fmt_number(columns=starts_with("taxbc_"),
             scale=1e-9,
             decimals=1) |> 
  fmt_number(columns=starts_with("avgtaxbc_"),
             scale=1,
             decimals=0) 
Comparison by AGI range of Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall)
Tax year 2023
agirange # of records # of returns
(millions)
tax before credits
($ billions)
average tax before credits
($ dollars)
n_pe n_td n_tdall nret_pe nret_td nret_tdall taxbc_pe taxbc_td taxbc_tdall avgtaxbc_pe avgtaxbc_td
(-Inf,-1e-99] 218 3,362 4,620 0.2 1.6 2.0 0.0 0.0 0.0 0 0
(-1e-99,0] 12,868 15,365 14,836 15.7 33.8 33.2 0.0 0.0 0.0 0 0
(0,2.5e+04] 25,219 52,574 50,632 47.4 62.4 61.1 6.4 5.4 5.4 134 86
(2.5e+04,5e+04] 27,612 36,602 35,718 39.3 30.2 30.0 74.9 55.0 54.4 1,904 1,817
(5e+04,1e+05] 37,759 44,821 43,655 44.1 38.3 38.4 277.8 235.7 232.5 6,293 6,145
(1e+05,2e+05] 19,874 37,925 36,546 30.5 28.8 29.3 588.1 496.8 490.3 19,312 17,272
(2e+05,5e+05] 15,454 23,228 20,648 10.4 11.2 11.7 567.1 558.5 569.1 54,548 50,067
(5e+05,1e+06] 5,055 12,946 12,549 1.4 1.5 1.8 195.9 237.5 276.0 142,159 160,170
(1e+06, Inf] 11,253 26,045 33,664 0.5 0.7 0.9 412.3 513.1 695.9 796,668 767,490
Total 155,312 252,868 252,868 189.5 208.4 208.4 2,122.5 2,101.9 2,323.7 11,198 10,085

3.3 Comparison of weighted sums by marital status for selected variables

Code
# #| column: screen

# count(sdf, MARS)
#  [1=single, 2=joint, 3=separate, 4=household-head, 5=widow(er)
mlabs <- c(single=1, joint=2, separate=3, "head of household"=4, "widow(er)"=5)
# mlabs

sdf2 <- sdf |> 
  mutate(mars2=factor(MARS, levels=mlabs, labels=names(mlabs)))
# count(sdf2, MARS, mars2)
  
tabdata <- sdf2 |> 
  summarise(n=n(), nret=sum(s006), taxbc=sum(taxbc * s006), .by=c(src, mars2)) |> 
  pivot_longer(cols= -c(src, mars2)) |> 
  arrange(src, name, mars2) |> 
  pivot_wider(names_from = c(name, src)) |> 
  select(mars2, starts_with(c("n_", "nret_", "taxbc_")), everything()) |> 
  arrange(mars2) |> 
  janitor::adorn_totals() |> 
  mutate(avgtaxbc_pe=taxbc_pe / nret_pe, avgtaxbc_td=taxbc_td / nret_td)

tabdata |> 
  gt() |> 
  tab_header(
    title = html("Comparison by marital status of Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall)"),
    subtitle = html("Tax year 2023")
  ) |>
  tab_spanner(columns = starts_with("n_"),
              label="# of records") |> 
  tab_spanner(columns = starts_with("nret_"),
              label=html("# of returns<br>(millions)")) |> 
  tab_spanner(columns = starts_with("taxbc_"),
              label=html("tax before credits<br>($ billions)")) |>   
  tab_spanner(columns = starts_with("avgtaxbc_"),
              label=html("average tax before credits<br>($ dollars)")) |> 
  fmt_number(columns=starts_with("n_"),
             decimals=0) |> 
  fmt_number(columns=starts_with("nret_"),
             scale=1e-6,
             decimals=1) |> 
  fmt_number(columns=starts_with("taxbc_"),
             scale=1e-9,
             decimals=1) |> 
  fmt_number(columns=starts_with("avgtaxbc_"),
             scale=1,
             decimals=0) 
Comparison by marital status of Policy Engine (pe), taxdata same variables (td), and taxdata all variables (tdall)
Tax year 2023
mars2 # of records # of returns
(millions)
tax before credits
($ billions)
average tax before credits
($ dollars)
n_pe n_td n_tdall nret_pe nret_td nret_tdall taxbc_pe taxbc_td taxbc_tdall avgtaxbc_pe avgtaxbc_td
single 69,876 104,801 104,801 87.8 114.5 114.5 493.8 491.2 515.7 5,627 4,290
joint 58,522 119,177 119,177 67.0 66.6 66.6 1,489.0 1,456.8 1,642.9 22,230 21,860
separate 2,438 4,711 4,711 4.4 3.7 3.7 25.9 38.4 44.9 5,826 10,360
head of household 12,796 24,179 24,179 15.0 23.6 23.6 53.4 115.6 120.2 3,565 4,898
widow(er) 11,680 NA NA 15.4 NA NA 60.3 NA NA 3,925 NA
Total 155,312 252,868 252,868 189.5 208.4 208.4 2,122.5 2,101.9 2,323.7 11,198 10,085

3.4 Correlations

Code
library(corrr)

# get top 10 variables
# ns(sdf)
topvars <- sdf |> 
  filter(src=="pe") |> 
  select(s006, starts_with("e", ignore.case=FALSE)) |> 
  summarise(nret=sum(s006), across(-s006, ~ sum(.x * s006))) |> 
  pivot_longer(-nret) |> 
  filter(str_sub(name, 2, 2) %in% 0:9,
         str_sub(name, -1, -1) %in% 0:9) |> 
  arrange(desc(value)) |> 
  select(-nret) |> 
  filter(value > 0)

vars <- topvars$name[1:10]


f <- function(df){
  correlate(df) |> 
    shave() |> 
    rename(var1=term) |> 
    pivot_longer(-var1, names_to = "var2", values_to = "corr") |> 
    filter(!is.na(corr))
}

cordf <- sdf |> 
  filter(src %in% c("td", "pe")) |> 
  select(src, any_of(vars)) |> 
  reframe(f(pick(everything())), .by=src) |> 
  pivot_wider(names_from = src, values_from = corr) |> 
  mutate(diff=pe - td) |> 
  arrange(desc(abs(diff))) |> 
  left_join(utcvars |> rename(var1=vname, desc1=desc), by = join_by(var1)) |> 
  left_join(utcvars |> rename(var2=vname, desc2=desc), by = join_by(var2))
Code
cordf |> 
  # filter(row_number() <= 10) |> 
  filter(abs(diff) >= 0.1) |> 
  gt() |>  
  tab_header(
    title = html("Correlation pairs with absolute difference >= 0.10, taxdata (td) and Policy Engine (pe)"),
    subtitle = html("Tax year 2023")
  ) |>
  fmt_number(columns=c(td, pe, diff),
             decimals=3)
Correlation pairs with absolute difference >= 0.10, taxdata (td) and Policy Engine (pe)
Tax year 2023
var1 var2 td pe diff desc1 desc2
e01700 e01500 0.243 1.000 0.757 Taxable pensions and annuities Total pensions and annuities
e00400 e00300 0.222 −0.062 −0.284 Tax-exempt interest income Taxable interest income
e01500 e02400 0.096 0.266 0.171 Total pensions and annuities Total social security (OASDI) benefits
e17500 e02400 0.123 −0.001 −0.125 Itemizable medical and dental expenses. WARNING: this variable is zero below the floor in PUF data. Total social security (OASDI) benefits
e00600 e00200 0.093 0.197 0.103 Ordinary dividends included in AGI Wages, salaries, and tips for filing unit net of pension contributions