replace markdown pipeline with goquery DOM walker for html mode

- Strip/simplify HTML directly instead of HTML→markdown→HTML round-trip
- Preserves tables, forms, lists, and HTML4 attributes (bgcolor, width, etc)
- Resolve relative URLs for links and images
- Add proper user-agent header for image downloads
- Parallel image downloads with sync.WaitGroup
- SVG rendering support via oksvg/rasterx
- Set correct width/height attributes on resized images
- Print media emulation to strip print-hidden elements
- Remove h2m and goldmark dependencies

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Antoni Sawicki
2026-03-13 00:34:50 -07:00
parent 916b5ee1b7
commit 64c2a16085
3 changed files with 231 additions and 176 deletions

11
go.mod
View File

@@ -3,22 +3,20 @@ module github.com/tenox7/wrp
go 1.26
require (
github.com/JohannesKaufmann/html-to-markdown v1.6.0
github.com/MaxHalford/halfgone v0.0.0-20171017091812-482157b86ccb
github.com/PuerkitoBio/goquery v1.11.0
github.com/breml/rootcerts v0.3.4
github.com/chromedp/cdproto v0.0.0-20250803210736-d308e07a266d
github.com/chromedp/chromedp v0.14.2
github.com/ericpauley/go-quantize v0.0.0-20200331213906-ae555eb2afa4
github.com/lithammer/shortuuid/v4 v4.2.0
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646
github.com/soniakeys/quant v1.0.0
github.com/tenox7/gip v1.0.1
github.com/yuin/goldmark v1.7.16
golang.org/x/image v0.36.0
golang.org/x/net v0.51.0
)
require (
github.com/PuerkitoBio/goquery v1.11.0 // indirect
github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/chromedp/sysutil v1.1.0 // indirect
github.com/go-json-experiment/json v0.0.0-20260214004413-d219187c3433 // indirect
@@ -26,7 +24,8 @@ require (
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/google/uuid v1.6.0 // indirect
golang.org/x/net v0.51.0 // indirect
github.com/srwiley/oksvg v0.0.0-20221011165216-be6e8873101c // indirect
github.com/srwiley/rasterx v0.0.0-20220730225603-2ab79fcdd4ef // indirect
golang.org/x/sys v0.42.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
golang.org/x/text v0.34.0 // indirect
)

70
go.sum
View File

@@ -1,35 +1,19 @@
github.com/JohannesKaufmann/html-to-markdown v1.6.0 h1:04VXMiE50YYfCfLboJCLcgqF5x+rHJnb1ssNmqpLH/k=
github.com/JohannesKaufmann/html-to-markdown v1.6.0/go.mod h1:NUI78lGg/a7vpEJTz/0uOcYMaibytE4BUOQS8k78yPQ=
github.com/MaxHalford/halfgone v0.0.0-20171017091812-482157b86ccb h1:YQ+d0g0P0F/06oDoeEgDHeZCIrnKgLxXcqYOpe8sTuU=
github.com/MaxHalford/halfgone v0.0.0-20171017091812-482157b86ccb/go.mod h1:J86XzS1wgzJPjpQmpriJ+SetP17JSQUd9l+HWQK86jA=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo=
github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y=
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
github.com/breml/rootcerts v0.3.1 h1:PTO35OcW58K2ZYtdBykCsZh9k/eRd57bY65EHrKK/xA=
github.com/breml/rootcerts v0.3.1/go.mod h1:S/PKh+4d1HUn4HQovEB8hPJZO6pUZYrIhmXBhsegfXw=
github.com/breml/rootcerts v0.3.4 h1:9i7WNl/ctd9OEAOaTfLy//Wrlfxq/tRQ7v4okYFN9Ys=
github.com/breml/rootcerts v0.3.4/go.mod h1:S/PKh+4d1HUn4HQovEB8hPJZO6pUZYrIhmXBhsegfXw=
github.com/chromedp/cdproto v0.0.0-20250803210736-d308e07a266d h1:ZtA1sedVbEW7EW80Iz2GR3Ye6PwbJAJXjv7D74xG6HU=
github.com/chromedp/cdproto v0.0.0-20250803210736-d308e07a266d/go.mod h1:NItd7aLkcfOA/dcMXvl8p1u+lQqioRMq/SqDp71Pb/k=
github.com/chromedp/chromedp v0.14.1 h1:0uAbnxewy/Q+Bg7oafVePE/6EXEho9hnaC38f+TTENg=
github.com/chromedp/chromedp v0.14.1/go.mod h1:rHzAv60xDE7VNy/MYtTUrYreSc0ujt2O1/C3bzctYBo=
github.com/chromedp/chromedp v0.14.2 h1:r3b/WtwM50RsBZHMUm9fsNhhzRStTHrKdr2zmwbZSzM=
github.com/chromedp/chromedp v0.14.2/go.mod h1:rHzAv60xDE7VNy/MYtTUrYreSc0ujt2O1/C3bzctYBo=
github.com/chromedp/sysutil v1.1.0 h1:PUFNv5EcprjqXZD9nJb9b/c9ibAbxiYo4exNWZyipwM=
github.com/chromedp/sysutil v1.1.0/go.mod h1:WiThHUdltqCNKGc4gaU50XgYjwjYIhKWoHGPTUfWTJ8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/ericpauley/go-quantize v0.0.0-20200331213906-ae555eb2afa4 h1:BBade+JlV/f7JstZ4pitd4tHhpN+w+6I+LyOS7B4fyU=
github.com/ericpauley/go-quantize v0.0.0-20200331213906-ae555eb2afa4/go.mod h1:H7chHJglrhPPzetLdzBleF8d22WYOv7UM/lEKYiwlKM=
github.com/go-json-experiment/json v0.0.0-20250725192818-e39067aee2d2 h1:iizUGZ9pEquQS5jTGkh4AqeeHCMbfbjeb0zMt0aEFzs=
github.com/go-json-experiment/json v0.0.0-20250725192818-e39067aee2d2/go.mod h1:TiCD2a1pcmjd7YnhGH0f/zKNcCD06B029pHhzV23c2M=
github.com/go-json-experiment/json v0.0.0-20250813024750-ebf49471dced h1:Q311OHjMh/u5E2TITc++WlTP5We0xNseRMkHDyvhW7I=
github.com/go-json-experiment/json v0.0.0-20250813024750-ebf49471dced/go.mod h1:TiCD2a1pcmjd7YnhGH0f/zKNcCD06B029pHhzV23c2M=
github.com/go-json-experiment/json v0.0.0-20260214004413-d219187c3433 h1:vymEbVwYFP/L05h5TKQxvkXoKxNvTpjxYKdF1Nlwuao=
github.com/go-json-experiment/json v0.0.0-20260214004413-d219187c3433/go.mod h1:tphK2c80bpPhMOI4v6bIc2xWywPfbqi1Z06+RcrMkDg=
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
@@ -41,11 +25,6 @@ github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakr
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/lithammer/shortuuid/v4 v4.2.0 h1:LMFOzVB3996a7b8aBuEXxqOBflbfPQAiVzkIcHO0h8c=
@@ -54,38 +33,19 @@ github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
github.com/soniakeys/quant v1.0.0 h1:N1um9ktjbkZVcywBVAAYpZYSHxEfJGzshHCxx/DaI0Y=
github.com/soniakeys/quant v1.0.0/go.mod h1:HI1k023QuVbD4H8i9YdfZP2munIHU4QpjsImz6Y6zds=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/srwiley/oksvg v0.0.0-20221011165216-be6e8873101c h1:km8GpoQut05eY3GiYWEedbTT0qnSxrCjsVbb7yKY1KE=
github.com/srwiley/oksvg v0.0.0-20221011165216-be6e8873101c/go.mod h1:cNQ3dwVJtS5Hmnjxy6AgTPd0Inb3pW05ftPSX7NZO7Q=
github.com/srwiley/rasterx v0.0.0-20220730225603-2ab79fcdd4ef h1:Ch6Q+AZUxDBCVqdkI8FSpFyZDtCVBc2VmejdNrm5rRQ=
github.com/srwiley/rasterx v0.0.0-20220730225603-2ab79fcdd4ef/go.mod h1:nXTWP6+gD5+LUJ8krVhhoeHjvHTutPxMYl5SvkcnJNE=
github.com/tenox7/gip v1.0.1 h1:yRcHROzwBjV2BhCjnh1y19wIg5Ei5CTMaZ+lx9nMl3Q=
github.com/tenox7/gip v1.0.1/go.mod h1:MR/eaUKjLGkYIguDcAUrWyxG58ipjjCrzM92jwGqDno=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
github.com/yuin/goldmark v1.7.13 h1:GPddIs617DnBLFFVJFgpo1aBfe/4xcvMc3SB5t/D0pA=
github.com/yuin/goldmark v1.7.13/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
github.com/yuin/goldmark v1.7.16 h1:n+CJdUxaFMiDUNnWC3dMWCIQJSkxH4uz3ZwQBkAlVNE=
github.com/yuin/goldmark v1.7.16/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/image v0.29.0 h1:HcdsyR4Gsuys/Axh0rDEmlBmB68rW1U9BUdB3UVHsas=
golang.org/x/image v0.29.0/go.mod h1:RVJROnf3SLK8d26OW91j4FrIHGbsJ8QnbEocVTOWQDA=
golang.org/x/image v0.30.0 h1:jD5RhkmVAnjqaCUXfbGBrn3lpxbknfN9w2UhHHU+5B4=
golang.org/x/image v0.30.0/go.mod h1:SAEUTxCCMWSrJcCy/4HwavEsfZZJlYxeHLc6tTiAe/c=
golang.org/x/image v0.36.0 h1:Iknbfm1afbgtwPTmHnS2gTM/6PPZfH+z2EFuOkSbqwc=
golang.org/x/image v0.36.0/go.mod h1:YsWD2TyyGKiIX1kZlu9QfKIsQ4nAAK9bdgdrIsE7xy4=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
@@ -97,17 +57,11 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs=
golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8=
golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -124,28 +78,20 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA=
golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI=
golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk=
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
@@ -157,6 +103,8 @@ golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
@@ -164,9 +112,3 @@ golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=

326
shtml.go
View File

@@ -1,23 +1,6 @@
// WRP TXT / Simple HTML Mode Routines
// WRP Simple HTML Mode Routines
package main
// TODO:
// - add image processing times counter to the footer
// - img cache w/garbage collector / test back/button behavior in old browsers
// - add referer header
// - svg support
// - incorrect cert support in both markdown and image download
// - unify cdp and txt image handlers
// - use goroutiness to process images
// - get inner html from chromedp instead of html2markdown
//
// - BUG: DomainFromURL always prefixes with http instead of https
// reproduces on vsi vms docs
// - BUG: markdown table errors
// reproduces on hacker news
// - BUG: captcha errors using html to markdown, perhaps use cdp inner html + downloaded images
// reproduces on https://www.cnn.com/cnn-underscored/electronics
import (
"bytes"
"encoding/base64"
@@ -30,24 +13,22 @@ import (
"io"
"log"
"net/http"
"net/url"
"strconv"
"strings"
"sync"
"time"
h2m "github.com/JohannesKaufmann/html-to-markdown"
"github.com/JohannesKaufmann/html-to-markdown/plugin"
"github.com/PuerkitoBio/goquery"
"github.com/chromedp/cdproto/emulation"
"github.com/chromedp/chromedp"
"github.com/lithammer/shortuuid/v4"
"github.com/nfnt/resize"
"github.com/srwiley/oksvg"
"github.com/srwiley/rasterx"
"github.com/tenox7/gip"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/extension"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/text"
"github.com/yuin/goldmark/util"
"golang.org/x/image/webp"
"golang.org/x/net/html"
)
var imgStor imageStore
@@ -91,69 +72,108 @@ func (i *imageStore) del(id string) {
delete(i.img, id)
}
func fetchImage(id, url, imgType string, maxSize, imgOpt int) (int, error) {
log.Printf("Downloading IMGZ URL=%q for ID=%q", url, id)
func fetchImage(id, imgURL, imgType string, maxSize, imgOpt int) (int, int, int, error) {
log.Printf("Downloading IMGZ URL=%q for ID=%q", imgURL, id)
var in []byte
var err error
switch url[:4] {
if len(imgURL) < 4 {
return 0, 0, 0, fmt.Errorf("image URL too short: %q", imgURL)
}
switch imgURL[:4] {
case "http":
req, err := http.NewRequest("GET", url, nil)
req, err := http.NewRequest("GET", imgURL, nil)
if err != nil {
return 0, fmt.Errorf("Error creating request for %q: %v", url, err)
return 0, 0, 0, fmt.Errorf("Error creating request for %q: %v", imgURL, err)
}
if *userAgent != "" {
req.Header.Set("User-Agent", *userAgent)
}
r, err := http.DefaultClient.Do(req)
if err != nil {
return 0, fmt.Errorf("Error downloading %q: %v", url, err)
return 0, 0, 0, fmt.Errorf("Error downloading %q: %v", imgURL, err)
}
if r.StatusCode != http.StatusOK {
return 0, fmt.Errorf("Error %q HTTP Status Code: %v", url, r.StatusCode)
return 0, 0, 0, fmt.Errorf("Error %q HTTP Status Code: %v", imgURL, r.StatusCode)
}
defer r.Body.Close()
in, err = io.ReadAll(r.Body)
if err != nil {
return 0, fmt.Errorf("Error reading %q: %v", url, err)
return 0, 0, 0, fmt.Errorf("Error reading %q: %v", imgURL, err)
}
case "data":
idx := strings.Index(url, ",")
idx := strings.Index(imgURL, ",")
if idx < 1 {
return 0, fmt.Errorf("image is embeded but unable to find coma: %q", url)
return 0, 0, 0, fmt.Errorf("image is embeded but unable to find coma: %q", imgURL)
}
in, err = base64.StdEncoding.DecodeString(url[idx+1:])
in, err = base64.StdEncoding.DecodeString(imgURL[idx+1:])
if err != nil {
return 0, fmt.Errorf("error decoding image from url embed: %q: %v", url, err)
return 0, 0, 0, fmt.Errorf("error decoding image from url embed: %q: %v", imgURL, err)
}
default:
return 0, 0, 0, fmt.Errorf("unsupported image URL scheme: %q", imgURL)
}
out, err := smallImg(in, imgType, maxSize, imgOpt)
out, w, h, err := smallImg(in, imgType, maxSize, imgOpt)
if err != nil {
return 0, fmt.Errorf("Error scaling down image: %v", err)
return 0, 0, 0, fmt.Errorf("Error scaling down %q: %v", imgURL, err)
}
imgStor.add(id, url, out)
return len(out), nil
imgStor.add(id, imgURL, out)
return len(out), w, h, nil
}
func smallImg(src []byte, imgType string, maxSize, imgOpt int) ([]byte, error) {
func decodeSVG(src []byte, maxSize int) (image.Image, error) {
icon, err := oksvg.ReadIconStream(bytes.NewReader(src))
if err != nil {
return nil, err
}
w, h := int(icon.ViewBox.W), int(icon.ViewBox.H)
if w <= 0 || h <= 0 {
w, h = maxSize, maxSize
}
if w > maxSize || h > maxSize {
ratio := float64(maxSize) / float64(max(w, h))
w = int(float64(w) * ratio)
h = int(float64(h) * ratio)
}
if w <= 0 {
w = 1
}
if h <= 0 {
h = 1
}
icon.SetTarget(0, 0, float64(w), float64(h))
rgba := image.NewRGBA(image.Rect(0, 0, w, h))
icon.Draw(rasterx.NewDasher(w, h, rasterx.NewScannerGV(w, h, rgba, rgba.Bounds())), 1)
return rgba, nil
}
func isSVG(src []byte) bool {
s := strings.TrimSpace(string(src[:min(len(src), 256)]))
return strings.HasPrefix(s, "<svg") || strings.HasPrefix(s, "<?xml")
}
func smallImg(src []byte, imgType string, maxSize, imgOpt int) ([]byte, int, int, error) {
t := http.DetectContentType(src)
var err error
var img image.Image
switch t {
case "image/png":
switch {
case t == "image/png":
img, err = png.Decode(bytes.NewReader(src))
case "image/gif":
case t == "image/gif":
img, err = gif.Decode(bytes.NewReader(src))
case "image/jpeg":
case t == "image/jpeg":
img, err = jpeg.Decode(bytes.NewReader(src))
case "image/webp":
case t == "image/webp":
img, err = webp.Decode(bytes.NewReader(src))
default: // TODO: also add svg
case t == "image/svg+xml", isSVG(src):
img, err = decodeSVG(src, maxSize)
default:
err = errors.New("unknown content type: " + t)
}
if err != nil {
return nil, fmt.Errorf("image decode problem: %v", err)
return nil, 0, 0, fmt.Errorf("image decode problem: %v", err)
}
img = resize.Thumbnail(uint(maxSize), uint(maxSize), img, resize.NearestNeighbor)
b := img.Bounds()
var outBuf bytes.Buffer
switch imgType {
case "gip":
@@ -166,87 +186,181 @@ func smallImg(src []byte, imgType string, maxSize, imgOpt int) ([]byte, error) {
err = jpeg.Encode(&outBuf, img, &jpeg.Options{Quality: imgOpt})
}
if err != nil {
return nil, fmt.Errorf("gif encode problem: %v", err)
return nil, 0, 0, fmt.Errorf("image encode problem: %v", err)
}
return outBuf.Bytes(), nil
return outBuf.Bytes(), b.Dx(), b.Dy(), nil
}
type astTransformer struct {
imgType string
maxSize int
imgOpt int
totSize int
var removeElements = []string{
"script", "style", "link", "meta", "noscript", "iframe",
"svg", "canvas", "video", "audio", "source", "picture",
"template", "slot", "dialog", "portal",
}
func (t *astTransformer) Transform(node *ast.Document, reader text.Reader, pc parser.Context) {
ast.Walk(node, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
if link, ok := n.(*ast.Link); ok && entering {
link.Destination = append([]byte("/?m=html&t="+t.imgType+"&s="+strconv.Itoa(t.maxSize)+"&url="), link.Destination...)
}
if img, ok := n.(*ast.Image); ok && entering {
var imgExt string
if t.imgType == "gip" {
imgExt = "gif"
} else {
imgExt = t.imgType
var renameToDiv = map[string]bool{
"section": true, "article": true, "nav": true,
"header": true, "footer": true, "aside": true,
"main": true, "figure": true, "figcaption": true,
"details": true, "summary": true, "hgroup": true,
"mark": true, "time": true, "search": true,
}
var keepAttrs = map[string]bool{
"href": true, "src": true, "alt": true, "title": true,
"width": true, "height": true, "border": true,
"cellpadding": true, "cellspacing": true,
"bgcolor": true, "background": true,
"align": true, "valign": true,
"colspan": true, "rowspan": true, "nowrap": true,
"name": true, "value": true, "type": true,
"action": true, "method": true, "enctype": true,
"size": true, "maxlength": true,
"checked": true, "selected": true, "multiple": true,
"disabled": true, "readonly": true,
"placeholder": true, "for": true,
"rows": true, "cols": true,
"color": true, "face": true,
}
func resolveURL(raw string, base *url.URL) string {
if raw == "" {
return ""
}
ref, err := url.Parse(raw)
if err != nil {
return raw
}
return base.ResolveReference(ref).String()
}
func simplifyDOM(doc *goquery.Document, rq *wrpReq) int {
doc.Find(strings.Join(removeElements, ", ")).Remove()
doc.Find("*").Each(func(i int, s *goquery.Selection) {
for _, n := range s.Nodes {
if n.Type != html.ElementNode {
return
}
seq := shortuuid.New() + "." + imgExt
size, err := fetchImage(seq, string(img.Destination), t.imgType, t.maxSize, t.imgOpt) // TODO: use goroutines with waitgroup
if err != nil {
log.Print(err)
n.Parent().RemoveChildren(n)
return ast.WalkContinue, nil
if renameToDiv[n.Data] {
n.Data = "div"
}
img.Destination = []byte(imgZpfx + seq)
t.totSize += size
var keep []html.Attribute
for _, a := range n.Attr {
if keepAttrs[a.Key] {
keep = append(keep, a)
}
}
n.Attr = keep
}
return ast.WalkContinue, nil
})
}
func (rq *wrpReq) captureMarkdown() {
log.Printf("Processing Markdown conversion request for %v", rq.url)
var outerHTML string
err := chromedp.Run(ctx, waitForRender(), chromedp.OuterHTML("html", &outerHTML, chromedp.ByQuery))
if err != nil {
log.Printf("Failed to get OuterHTML via CDP: %v", err)
http.Error(rq.w, err.Error(), http.StatusInternalServerError)
return
imgExt := rq.imgType
if imgExt == "gip" {
imgExt = "gif"
}
log.Printf("Got %v bytes HTML from CDP for %v", len(outerHTML), rq.url)
c := h2m.NewConverter(h2m.DomainFromURL(rq.url), true, nil)
c.Use(plugin.GitHubFlavored())
md, err := c.ConvertString(outerHTML)
if err != nil {
http.Error(rq.w, err.Error(), http.StatusInternalServerError)
return
}
log.Printf("Got %v bytes md from %v", len(md), rq.url)
var imgOpt int
switch rq.imgType {
case "jpg":
imgOpt = int(rq.jQual)
case "gif":
imgOpt = int(rq.nColors)
case "gip":
imgOpt = 0
}
t := &astTransformer{imgType: rq.imgType, maxSize: int(rq.maxSize), imgOpt: imgOpt}
gm := goldmark.New(
goldmark.WithExtensions(extension.GFM),
goldmark.WithParserOptions(parser.WithASTTransformers(util.Prioritized(t, 100))),
baseURL, _ := url.Parse(rq.url)
wrpParams := fmt.Sprintf("m=html&t=%s&s=%d", rq.imgType, rq.maxSize)
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
href, exists := s.Attr("href")
if !exists || href == "" {
return
}
if strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") {
return
}
abs := resolveURL(href, baseURL)
s.SetAttr("href", "/?"+wrpParams+"&url="+url.QueryEscape(abs))
})
type imgJob struct {
sel *goquery.Selection
seq string
abs string
}
var jobs []imgJob
doc.Find("img[src]").Each(func(i int, s *goquery.Selection) {
src, exists := s.Attr("src")
if !exists || src == "" {
s.Remove()
return
}
abs := resolveURL(src, baseURL)
seq := shortuuid.New() + "." + imgExt
jobs = append(jobs, imgJob{sel: s, seq: seq, abs: abs})
})
var wg sync.WaitGroup
var mu sync.Mutex
var totSize int
for i := range jobs {
wg.Add(1)
go func(j imgJob) {
defer wg.Done()
size, w, h, err := fetchImage(j.seq, j.abs, rq.imgType, int(rq.maxSize), imgOpt)
mu.Lock()
defer mu.Unlock()
if err != nil {
log.Print(err)
j.sel.Remove()
return
}
j.sel.SetAttr("src", imgZpfx+j.seq)
j.sel.SetAttr("width", strconv.Itoa(w))
j.sel.SetAttr("height", strconv.Itoa(h))
totSize += size
}(jobs[i])
}
wg.Wait()
return totSize
}
func (rq *wrpReq) captureMarkdown() {
log.Printf("Processing simple HTML conversion for %v", rq.url)
var outerHTML string
err := chromedp.Run(ctx,
waitForRender(),
emulation.SetEmulatedMedia().WithMedia("print"),
chromedp.Evaluate(`(function(){document.querySelectorAll('*').forEach(function(e){if(getComputedStyle(e).display==='none')e.remove()})})()`, nil),
chromedp.OuterHTML("html", &outerHTML, chromedp.ByQuery),
emulation.SetEmulatedMedia().WithMedia(""),
)
var ht bytes.Buffer
err = gm.Convert([]byte(md), &ht)
if err != nil {
log.Printf("Failed to get OuterHTML via CDP: %v", err)
http.Error(rq.w, err.Error(), http.StatusInternalServerError)
return
}
log.Printf("Got %v bytes HTML from CDP for %v", len(outerHTML), rq.url)
doc, err := goquery.NewDocumentFromReader(strings.NewReader(outerHTML))
if err != nil {
log.Printf("Failed to parse HTML: %v", err)
http.Error(rq.w, err.Error(), http.StatusInternalServerError)
return
}
totSize := simplifyDOM(doc, rq)
body := doc.Find("body")
simplified, err := body.Html()
if err != nil {
http.Error(rq.w, err.Error(), http.StatusInternalServerError)
return
}
log.Printf("Rendered %v bytes html for %v", len(ht.String()), rq.url)
log.Printf("Simplified to %v bytes html for %v", len(simplified), rq.url)
rq.printUI(uiParams{
text: string(asciify([]byte(ht.String()))),
text: string(asciify([]byte(simplified))),
bgColor: "#FFFFFF",
imgSize: fmt.Sprintf("%.0f KB", float32(t.totSize)/1024.0),
imgSize: fmt.Sprintf("%.0f KB", float32(totSize)/1024.0),
})
}