imgchest/model/
scraped_post.rs

1use once_cell::sync::Lazy;
2use scraper::Html;
3use scraper::Selector;
4
5static APP_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("#app").unwrap());
6
7/// An error that may occur while parsing a post
8#[derive(Debug, thiserror::Error)]
9pub enum FromHtmlError {
10    #[error("missing {0}")]
11    MissingElement(&'static str),
12
13    #[error("missing attribute {0}")]
14    MissingAttribute(&'static str),
15
16    #[error("invalid data page")]
17    InvalidDataPage(serde_json::Error),
18}
19
20/// A Post
21#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
22pub struct ScrapedPost {
23    /// The id of the post
24    pub id: Box<str>,
25
26    /// The title of the post
27    pub title: Box<str>,
28
29    /// The author of the post
30    pub username: Box<str>,
31
32    // /// The post privacy
33    // pub privacy: String,
34
35    // /// ?
36    // pub report_status: u32,
37    /// The number of views
38    pub views: u64,
39
40    /// Whether this is nsfw
41    pub nsfw: bool,
42
43    /// The number of images
44    pub image_count: u64,
45
46    // /// The timestamp of post creation
47    // pub created: String,
48    /// Post images
49    pub images: Box<[File]>,
50}
51
52impl ScrapedPost {
53    /// Parse this from html
54    pub(crate) fn from_html(html: &Html) -> Result<Self, FromHtmlError> {
55        // Implement:
56        // JSON.parse(document.getElementById('app').getAttribute('data-page'))
57        let app_element = html
58            .select(&APP_SELECTOR)
59            .next()
60            .ok_or(FromHtmlError::MissingElement("app div"))?;
61        let data_page_attr = app_element
62            .attr("data-page")
63            .ok_or(FromHtmlError::MissingAttribute("data-page"))?;
64        let page_data: PageData =
65            serde_json::from_str(data_page_attr).map_err(FromHtmlError::InvalidDataPage)?;
66
67        // Overflowing a u64 with image entries is impossible.
68        let image_count = u64::try_from(page_data.props.post.files.len()).unwrap();
69        let images: Vec<_> = page_data
70            .props
71            .post
72            .files
73            .into_iter()
74            .map(|file| File {
75                id: file.id,
76                description: file.description,
77                link: file.link,
78                position: file.position,
79            })
80            .collect();
81        Ok(Self {
82            id: page_data.props.post.slug,
83            title: page_data.props.post.title,
84            username: page_data.props.post.user.username,
85            views: page_data.props.post.views,
86            nsfw: page_data.props.post.nsfw != 0,
87            image_count,
88            images: images.into(),
89        })
90    }
91}
92
93#[derive(Debug, serde::Deserialize)]
94struct PageData {
95    props: PageDataProps,
96}
97
98#[derive(Debug, serde::Deserialize)]
99struct PageDataProps {
100    post: PageDataPost,
101}
102
103#[derive(Debug, serde::Deserialize)]
104struct PageDataPost {
105    files: Vec<PageDataFile>,
106    nsfw: u8,
107    slug: Box<str>,
108    title: Box<str>,
109    user: PageDataUser,
110    views: u64,
111}
112
113#[derive(Debug, serde::Deserialize)]
114struct PageDataUser {
115    username: Box<str>,
116}
117
118#[derive(Debug, serde::Deserialize)]
119struct PageDataFile {
120    id: Box<str>,
121    description: Option<Box<str>>,
122    link: Box<str>,
123    position: u32,
124}
125
126/// A post file
127#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
128pub struct File {
129    /// The file id
130    pub id: Box<str>,
131
132    /// The file description
133    pub description: Option<Box<str>>,
134
135    /// The file link
136    pub link: Box<str>,
137
138    /// The position of the image in the post.
139    ///
140    /// Starts at 1.
141    pub position: u32,
142    // /// The file creation time
143    // pub created: u32,
144}