Skip to content
Snippets Groups Projects
Commit b197e090 authored by Janne Mareike Koschinski's avatar Janne Mareike Koschinski Committed by Janne Mareike Koschinski
Browse files

Improved oEmbed and og/twitter parsing

parent c2c3f90b
Branches
No related tags found
No related merge requests found
vendor/
.idea/
*.iml
......@@ -23,9 +23,25 @@ func returnJson(w http.ResponseWriter, data interface{}) error {
return nil
}
// OEmbed is the decode target for the JSON document fetched from a page's
// oEmbed endpoint (the link/alternate URL whose type is
// "application/json+oembed"). buildData lists its author, provider and type
// values ahead of the equivalents scraped from the page's own tags.
type OEmbed struct {
Url string `json:"url"`
// AuthorName and AuthorUrl are the first candidates for the output's
// author name and author link.
AuthorName string `json:"author_name"`
AuthorUrl string `json:"author_url"`
Html string `json:"html"`
Width int `json:"width"`
Height int `json:"height"`
// Type is checked against the list of "large" formats and is also reported
// in the output's Types list when non-empty.
Type string `json:"type"`
// NOTE(review): CacheAge and Version are decoded as strings; a provider
// that sends either as a JSON number would make the decode fail — confirm
// against the providers in use.
CacheAge string `json:"cache_age"`
// ProviderName is the first candidate for the output's footer text.
ProviderName string `json:"provider_name"`
ProviderUrl string `json:"provider_url"`
Version string `json:"version"`
}
type InternalData struct {
url string
oembed_url string
twitter_card string
twitter_site string
twitter_site_id string
......@@ -108,6 +124,8 @@ type Data struct {
Text string `json:"text"`
Fields []DataField `json:"fields"`
ImageUrl string `json:"image_url"`
LargeImage bool `json:"large_image"`
Types []string `json:"types"`
ThumbUrl string `json:"thumb_url"`
Footer string `json:"footer"`
FooterIcon string `json:"footer_icon"`
......@@ -124,7 +142,7 @@ func coalesce(list []string) string {
return ""
}
func buildData(in InternalData) (out Data) {
func buildData(in InternalData, oEmbed OEmbed) (out Data) {
base, _ := url.ParseRequestURI(in.url)
resolve := func(path string) string {
if path == "" || base == nil {
......@@ -142,9 +160,11 @@ func buildData(in InternalData) (out Data) {
out.TitleLink = in.url
out.Title = coalesce([]string{in.twitter_title, in.og_title, in.meta_title, in.title})
out.Text = coalesce([]string{in.twitter_description, in.og_description, in.meta_description})
out.AuthorName = coalesce([]string{in.meta_author, in.og_site_name})
out.AuthorLink = resolve(coalesce([]string{in.article_author, in.link_author}))
out.AuthorName = coalesce([]string{oEmbed.AuthorName, in.meta_author})
out.AuthorLink = resolve(coalesce([]string{oEmbed.AuthorUrl, in.article_author, in.link_author}))
out.Color = coalesce([]string{in.meta_theme_color})
out.Footer = coalesce([]string{oEmbed.ProviderName, in.og_site_name})
out.FooterIcon = resolve(coalesce([]string{in.link_favicon}))
var largeImages []string
var smallImages []string
......@@ -156,212 +176,245 @@ func buildData(in InternalData) (out Data) {
largeImages = append(largeImages, in.og_image)
smallImages = append(smallImages, in.link_favicon)
out.ThumbUrl = resolve(coalesce(smallImages))
out.ImageUrl = resolve(coalesce(largeImages))
return
largeTypes := []string{
"video",
"video.other",
"article",
"summary_large_image",
"player",
}
func main() {
client := &http.Client{}
isLargeImage := func (format string) bool {
for _, t := range largeTypes {
if t == format {
return true
}
}
return false
}
loadData := func(url string) {
fmt.Printf("Searching for %s\n", url)
resp, err := client.Get(url)
if err != nil {
if err != nil {
panic(err)
out.LargeImage = isLargeImage(oEmbed.Type) || isLargeImage(in.og_type) || isLargeImage(in.twitter_card)
types := []string{}
for _, format := range []string{oEmbed.Type, in.og_type, in.twitter_card} {
if format != "" {
types = append(types, format)
}
return
}
out.Types = types
internalData := InternalData{
url: url,
out.ThumbUrl = resolve(coalesce(smallImages))
out.ImageUrl = resolve(coalesce(largeImages))
return
}
matchers := map[string]func(string){}
matchers["meta/twitter:card"] = func(content string) {
func main() {
client := &http.Client{}
matchers := map[string]func(*InternalData, string, string){}
matchers["meta/twitter:card"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_card = content
}
matchers["meta/twitter:site"] = func(content string) {
matchers["meta/twitter:site"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_site = content
}
matchers["meta/twitter:site:id"] = func(content string) {
matchers["meta/twitter:site:id"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_site_id = content
}
matchers["meta/twitter:creator"] = func(content string) {
matchers["meta/twitter:creator"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_creator = content
}
matchers["meta/twitter:creator:id"] = func(content string) {
matchers["meta/twitter:creator:id"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_creator_id = content
}
matchers["meta/twitter:description"] = func(content string) {
matchers["meta/twitter:description"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_description = content
}
matchers["meta/twitter:title"] = func(content string) {
matchers["meta/twitter:title"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_title = content
}
matchers["meta/twitter:image"] = func(content string) {
matchers["meta/twitter:image"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_image = content
}
matchers["meta/twitter:image:src"] = func(content string) {
matchers["meta/twitter:image:src"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_image = content
}
matchers["meta/twitter:image:alt"] = func(content string) {
matchers["meta/twitter:image:alt"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_image_alt = content
}
matchers["meta/twitter:player"] = func(content string) {
matchers["meta/twitter:player"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_player = content
}
matchers["meta/twitter:player:width"] = func(content string) {
matchers["meta/twitter:player:width"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_player_width = content
}
matchers["meta/twitter:player:height"] = func(content string) {
matchers["meta/twitter:player:height"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_player_height = content
}
matchers["meta/twitter:player:stream"] = func(content string) {
matchers["meta/twitter:player:stream"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_player_stream = content
}
matchers["meta/twitter:app:name:iphone"] = func(content string) {
matchers["meta/twitter:app:name:iphone"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_app_name_iphone = content
}
matchers["meta/twitter:app:id:iphone"] = func(content string) {
matchers["meta/twitter:app:id:iphone"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_app_id_iphone = content
}
matchers["meta/twitter:app:url:iphone"] = func(content string) {
matchers["meta/twitter:app:url:iphone"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_app_url_iphone = content
}
matchers["meta/twitter:app:name:ipad"] = func(content string) {
matchers["meta/twitter:app:name:ipad"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_app_name_ipad = content
}
matchers["meta/twitter:app:id:ipad"] = func(content string) {
matchers["meta/twitter:app:id:ipad"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_app_id_ipad = content
}
matchers["meta/twitter:app:url:ipad"] = func(content string) {
matchers["meta/twitter:app:url:ipad"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_app_url_ipad = content
}
matchers["meta/twitter:app:name:googleplay"] = func(content string) {
matchers["meta/twitter:app:name:googleplay"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_app_name_googleplay = content
}
matchers["meta/twitter:app:id:googleplay"] = func(content string) {
matchers["meta/twitter:app:id:googleplay"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_app_id_googleplay = content
}
matchers["meta/twitter:app:url:googleplay"] = func(content string) {
matchers["meta/twitter:app:url:googleplay"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_app_url_googleplay = content
}
matchers["meta/twitter:label1"] = func(content string) {
matchers["meta/twitter:label1"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_label1 = content
}
matchers["meta/twitter:data1"] = func(content string) {
matchers["meta/twitter:data1"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_data1 = content
}
matchers["meta/twitter:label2"] = func(content string) {
matchers["meta/twitter:label2"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_label2 = content
}
matchers["meta/twitter:data2"] = func(content string) {
matchers["meta/twitter:data2"] = func(internalData *InternalData, content string, extra string) {
internalData.twitter_data2 = content
}
matchers["meta/og:url"] = func(content string) {
matchers["meta/og:url"] = func(internalData *InternalData, content string, extra string) {
internalData.og_url = content
}
matchers["meta/og:title"] = func(content string) {
matchers["meta/og:title"] = func(internalData *InternalData, content string, extra string) {
internalData.og_title = content
}
matchers["meta/og:description"] = func(content string) {
matchers["meta/og:description"] = func(internalData *InternalData, content string, extra string) {
internalData.og_description = content
}
matchers["meta/og:type"] = func(content string) {
matchers["meta/og:type"] = func(internalData *InternalData, content string, extra string) {
internalData.og_type = content
}
matchers["meta/og:locale"] = func(content string) {
matchers["meta/og:locale"] = func(internalData *InternalData, content string, extra string) {
internalData.og_locale = content
}
matchers["meta/og:video"] = func(content string) {
matchers["meta/og:video"] = func(internalData *InternalData, content string, extra string) {
internalData.og_video = content
}
matchers["meta/og:video:url"] = func(content string) {
matchers["meta/og:video:url"] = func(internalData *InternalData, content string, extra string) {
internalData.og_video_url = content
}
matchers["meta/og:video:secure_url"] = func(content string) {
matchers["meta/og:video:secure_url"] = func(internalData *InternalData, content string, extra string) {
internalData.og_video_secure_url = content
}
matchers["meta/og:video:type"] = func(content string) {
matchers["meta/og:video:type"] = func(internalData *InternalData, content string, extra string) {
internalData.og_video_type = content
}
matchers["meta/og:video:width"] = func(content string) {
matchers["meta/og:video:width"] = func(internalData *InternalData, content string, extra string) {
internalData.og_video_width = content
}
matchers["meta/og:video:height"] = func(content string) {
matchers["meta/og:video:height"] = func(internalData *InternalData, content string, extra string) {
internalData.og_video_height = content
}
matchers["meta/og:image"] = func(content string) {
matchers["meta/og:image"] = func(internalData *InternalData, content string, extra string) {
internalData.og_image = content
}
matchers["meta/og:image:url"] = func(content string) {
matchers["meta/og:image:url"] = func(internalData *InternalData, content string, extra string) {
internalData.og_image_url = content
}
matchers["meta/og:image:secure_url"] = func(content string) {
matchers["meta/og:image:secure_url"] = func(internalData *InternalData, content string, extra string) {
internalData.og_image_secure_url = content
}
matchers["meta/og:image:type"] = func(content string) {
matchers["meta/og:image:type"] = func(internalData *InternalData, content string, extra string) {
internalData.og_image_type = content
}
matchers["meta/og:image:width"] = func(content string) {
matchers["meta/og:image:width"] = func(internalData *InternalData, content string, extra string) {
internalData.og_image_width = content
}
matchers["meta/og:image:height"] = func(content string) {
matchers["meta/og:image:height"] = func(internalData *InternalData, content string, extra string) {
internalData.og_image_height = content
}
matchers["meta/og:audio"] = func(content string) {
matchers["meta/og:audio"] = func(internalData *InternalData, content string, extra string) {
internalData.og_audio = content
}
matchers["meta/og:audio:url"] = func(content string) {
matchers["meta/og:audio:url"] = func(internalData *InternalData, content string, extra string) {
internalData.og_audio_url = content
}
matchers["meta/og:audio:secure_url"] = func(content string) {
matchers["meta/og:audio:secure_url"] = func(internalData *InternalData, content string, extra string) {
internalData.og_audio_secure_url = content
}
matchers["meta/og:audio:type"] = func(content string) {
matchers["meta/og:audio:type"] = func(internalData *InternalData, content string, extra string) {
internalData.og_audio_type = content
}
matchers["meta/og:site_name"] = func(content string) {
matchers["meta/og:site_name"] = func(internalData *InternalData, content string, extra string) {
internalData.og_site_name = content
}
matchers["meta/article:author"] = func(content string) {
matchers["meta/article:author"] = func(internalData *InternalData, content string, extra string) {
internalData.article_author = content
}
matchers["meta/article:published_time"] = func(content string) {
matchers["meta/article:published_time"] = func(internalData *InternalData, content string, extra string) {
internalData.article_published_time = content
}
matchers["meta/title"] = func(content string) {
matchers["meta/title"] = func(internalData *InternalData, content string, extra string) {
internalData.meta_title = content
}
matchers["meta/description"] = func(content string) {
matchers["meta/description"] = func(internalData *InternalData, content string, extra string) {
internalData.meta_description = content
}
matchers["meta/author"] = func(content string) {
matchers["meta/author"] = func(internalData *InternalData, content string, extra string) {
internalData.meta_author = content
}
matchers["meta/theme-color"] = func(content string) {
matchers["meta/theme-color"] = func(internalData *InternalData, content string, extra string) {
internalData.meta_theme_color = content
}
matchers["link/icon"] = func(content string) {
matchers["link/icon"] = func(internalData *InternalData, content string, extra string) {
internalData.link_favicon = content
}
matchers["link/author"] = func(content string) {
matchers["link/author"] = func(internalData *InternalData, content string, extra string) {
internalData.link_author = content
}
matchers["title"] = func(content string) {
matchers["link/alternate"] = func(internalData *InternalData, content string, extra string) {
if extra == "application/json+oembed" {
internalData.oembed_url = content
}
}
matchers["title"] = func(internalData *InternalData, content string, extra string) {
internalData.title = content
}
loadData := func(url string) (Data, error) {
fmt.Printf("Searching for %s\n", url)
var data Data
resp, err := client.Get(url)
if err != nil {
return data, err
}
internalData := InternalData{
url: url,
}
var parseNode func(*html.Node)
parseNode = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "meta" {
......@@ -373,7 +426,7 @@ func main() {
if name != "" {
matcher := matchers["meta/"+name]
if matcher != nil {
matcher(attrs["content"])
matcher(&internalData, attrs["content"], "")
}
}
}
......@@ -386,7 +439,7 @@ func main() {
for _, name := range names {
matcher := matchers["link/"+name]
if matcher != nil {
matcher(attrs["href"])
matcher(&internalData, attrs["href"], attrs["type"])
}
}
}
......@@ -395,7 +448,7 @@ func main() {
if c != nil && c.Type == html.TextNode {
matcher := matchers["title"]
if matcher != nil {
matcher(c.Data)
matcher(&internalData, c.Data, "")
}
}
}
......@@ -404,36 +457,57 @@ func main() {
}
}
contentType := strings.SplitN(resp.Header.Get("Content-Type"), ";", 2)[0]
if contentType == "text/html" ||
contentType == "application/xhtml+xml" ||
contentType == "application/xhtml" ||
contentType == "application/xml" {
doc, _ := html.Parse(resp.Body)
parseNode(doc)
err = resp.Body.Close()
if err != nil {
panic(err)
}
}
data := buildData(internalData)
var oEmbedData OEmbed
marshalled, err := json.Marshal(data)
if err != nil {
panic(err)
if internalData.oembed_url != "" {
	// The oEmbed lookup is best-effort: the page metadata collected so far is
	// still useful without it, so failures are reported but do not abort.
	fmt.Printf("Searching for %s\n", internalData.oembed_url)
	oEmbedResp, oEmbedErr := client.Get(internalData.oembed_url)
	if oEmbedErr != nil {
		// On a failed request the response is nil, so there is no body to
		// close; touching it would be a nil-pointer dereference.
		fmt.Printf("Fetching oEmbed data from %s failed: %v\n", internalData.oembed_url, oEmbedErr)
	} else {
		if decodeErr := json.NewDecoder(oEmbedResp.Body).Decode(&oEmbedData); decodeErr != nil {
			fmt.Printf("Decoding oEmbed data from %s failed: %v\n", internalData.oembed_url, decodeErr)
		}
		// A close error on a body we have finished reading is not actionable.
		_ = oEmbedResp.Body.Close()
	}
}
fmt.Printf("%+v\n", internalData)
fmt.Println(string(marshalled))
data = buildData(internalData, oEmbedData)
return data, err
}
/*
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
loadData(strings.TrimSpace(r.URL.Query().Get("url")))
url := strings.TrimSpace(r.URL.Query().Get("url"))
if url != "" {
data, err := loadData(url)
if err != nil {
panic(err.Error())
}
err = returnJson(w, data)
if err != nil {
panic(err)
}
}
})
err := http.ListenAndServe(":8080", nil)
if err != nil {
panic(err)
}
*/
/*
loadData("https://medium.com/slack-developer-blog/everything-you-ever-wanted-to-know-about-unfurling-but-were-afraid-to-ask-or-how-to-make-your-e64b4bb9254")
loadData("http://harvard.edu")
loadData("https://twitter.com/dw_politik/status/1092872739445104640")
loadData("https://twitter.com/raketenlurch/status/1093991675209416704")
*/
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment