Which subreddit deletes the most comments? Is there censorship?

If you've been a long time reddit user you've probably realized that some subreddits are filled with deleted comments. Out of curiosity, I decided to do a quick analysis on just how many comments are deleted. To get started let's look at the result.

Result

Looking at the top 10 subreddits gives us a good baseline for how many comments are usually deleted, because of the larger sample size they provide. Interestingly, r/science has an absurd number of deleted comments. In the second chart we see that the average share of deleted comments is 0.23% without r/science. That number will be useful for the next chart.

In the last chart I picked some interesting subreddits to study. The green line shows the average that was taken from the previous chart. Something I noticed is that the style of moderation has a bigger influence on deletions than the nature of the sub. For example we can see AskMen (0.11%) vs AskReddit (0.2%) vs AskWomen (2.53%), and relationship_advice (0.38%) vs relationships (1.32%).

Methodology

All data were pulled from the top 30 posts in each subreddit over 7 days in an ad-hoc manner. Each pull was done at least 24 hours after the previous one to prevent any overlap.

Source Code

I'm releasing the source code so everyone can perform their own analysis. It's written in JavaScript and needs to be run in the browser on reddit. The main functions to run are right at the bottom.


/**
 * Requests the "hot" listing of a subreddit and passes the post ids on.
 *
 * Any failure (network error, invalid JSON, or an exception thrown by
 * `callback`) is logged with console.error; `callback` is not invoked when
 * the request or the parsing fails.
 *
 * @param {string} subreddit - Subreddit name as used in the URL path, e.g. 'science'.
 * @param {function(string[]): void} callback - Receives the keys of `data.posts`.
 */
function getPosts(subreddit, callback) {
  const url = `https://gateway.reddit.com/desktopapi/v1/subreddits/${subreddit}?&sort=hot`;

  const parseBody = (response) => response.json();
  const deliverIds = (data) => {
    callback(Object.keys(data.posts));
  };
  const logFailure = (err) => {
    console.error(err);
  };

  fetch(url).then(parseBody).then(deliverIds).catch(logFailure);
}

/**
 * Downloads the comment tree of one post and counts how many comments were
 * removed by a moderator.
 *
 * A comment counts as deleted when its `deletedBy` field is exactly the
 * string 'moderator'. A response without a `comments` field yields
 * `{ total: 0, deleted: 0 }`.
 *
 * Any failure (network error, invalid JSON, or an exception thrown by
 * `callback`) is logged with console.error; `callback` is not invoked when
 * the request or the parsing fails.
 *
 * @param {string} id - Post id as returned by getPosts.
 * @param {function({total: number, deleted: number}): void} callback
 */
function getComments(id, callback) {
  const url = 'https://gateway.reddit.com/desktopapi/v1/postcomments/' + id + '?sort=top&depth=100&limit=100000';

  fetch(url)
    .then((response) => response.json())
    .then((data) => {
      // `|| {}` keeps the "no comments field" case at zero instead of throwing.
      const comments = Object.values(data.comments || {});
      const removedByMod = comments.filter((comment) => comment.deletedBy === 'moderator');
      callback({
        total: comments.length,
        deleted: removedByMod.length,
      });
    })
    .catch((err) => {
      console.error(err);
    });
}

/**
 * Sums comment statistics over every post currently listed for a subreddit.
 *
 * `callback` is invoked once, after the statistics of all posts have arrived.
 * A subreddit with no posts reports `deleted = 0, total = 0` immediately.
 *
 * NOTE: getPosts/getComments only log on failure and do not call back, so a
 * failed request still prevents `callback` from ever being invoked.
 *
 * @param {string} subreddit - Subreddit name, passed through to `callback`.
 * @param {function(string, number, number): void} callback - Called as
 *   `callback(subreddit, deleted, total)`.
 */
function analyze(subreddit, callback) {
  getPosts(subreddit, (ids) => {
    // With zero posts the loop below never runs, so without this guard the
    // completion check is never reached and the caller waits forever.
    if (ids.length === 0) {
      callback(subreddit, 0, 0);
      return;
    }

    let total = 0;
    let deleted = 0;
    let counted = 0;

    for (const id of ids) {
      getComments(id, (stats) => {
        total += stats.total;
        deleted += stats.deleted;
        counted += 1;
        // The requests run concurrently; report once the last one is in.
        if (counted === ids.length) {
          callback(subreddit, deleted, total);
        }
      });
    }
  });
}

/**
 * Analyzes the given subreddits one after another and prints the combined
 * result as a JSON string with console.log.
 *
 * The printed object maps each subreddit name to `{ deleted, total }`.
 *
 * @param {string[]} subs - Subreddit names to analyze, in order.
 */
function analyzeToJSON(subs) {
  const collected = {};

  // Each analysis starts only after the previous one has reported back.
  const step = (index) => {
    if (index < subs.length) {
      analyze(subs[index], (sub, deleted, total) => {
        collected[sub] = { deleted, total };
        index += 1;
        step(index);
      });
      return;
    }
    console.log(JSON.stringify(collected));
  };

  step(0);
}

/**
 * Analyzes the given subreddits one after another and prints the result as
 * CSV text (header `sub,deleted,total,percent`) with console.log.
 *
 * The percent column is `deleted / total` as a percentage rounded to two
 * decimal places.
 *
 * @param {string[]} subs - Subreddit names to analyze, in order.
 */
function analyzeToCSV(subs) {
  const rows = ['sub,deleted,total,percent\n'];

  // Each analysis starts only after the previous one has reported back.
  const visit = (position) => {
    if (position >= subs.length) {
      console.log(rows.join(''));
      return;
    }
    analyze(subs[position], (sub, deleted, total) => {
      const percent = Math.round(deleted / total * 10000) / 100;
      rows.push(`${sub},${deleted},${total},${percent}%\n`);
      position += 1;
      visit(position);
    });
  };

  visit(0);
}

/**
 * Merges several analysis runs into one summary per subreddit.
 *
 * @param {Array<Object<string, {deleted: number, total: number}>>} entries -
 *   One object per run, mapping subreddit name to its counts.
 * @returns {Object<string, {deleted: number, total: number, percent: number}>}
 *   Summed counts per subreddit. `percent` is the fraction `deleted / total`
 *   (0..1, not multiplied by 100), or 0 when no comments were seen.
 */
function compile(entries) {
  const results = {};

  for (const entry of entries) {
    for (const sub in entry) {
      // Own-property check: a plain truthiness test would mistake inherited
      // members such as `toString` for an existing subreddit record.
      if (!Object.prototype.hasOwnProperty.call(results, sub)) {
        results[sub] = {
          deleted: 0,
          total: 0,
        };
      }
      const summary = results[sub];
      const counts = entry[sub];
      summary.deleted += counts.deleted;
      summary.total += counts.total;
    }
  }

  for (const sub in results) {
    const summary = results[sub];
    // 0 / 0 would be NaN, which then turns the mean, median, min and max of
    // the whole data set into NaN as well.
    summary.percent = summary.total > 0 ? summary.deleted / summary.total : 0;
  }

  return results;
}

/**
 * Arithmetic mean of the `percent` field over all subreddits.
 *
 * @param {Object<string, {percent: number}>} entries - Keyed by subreddit name.
 * @returns {number} The mean, or NaN when `entries` has no keys.
 */
function getMean(entries) {
  const percents = Object.keys(entries).map((sub) => entries[sub].percent);
  const sum = percents.reduce((acc, value) => acc + value, 0);
  return sum / percents.length;
}

/**
 * Median of the `percent` field over all subreddits.
 *
 * With an even number of values the two middle values are averaged.
 *
 * @param {Object<string, {percent: number}>} entries - Keyed by subreddit name.
 * @returns {number} The median, or NaN when `entries` has no keys.
 */
function getMedian(entries) {
  const sorted = Object.keys(entries)
    .map((sub) => entries[sub].percent)
    .sort((x, y) => x - y);
  const upper = Math.floor(sorted.length / 2);

  if (sorted.length % 2 !== 0) {
    return sorted[upper];
  }
  return (sorted[upper] + sorted[upper - 1]) / 2;
}

/**
 * Population standard deviation of the `percent` field over all subreddits:
 * the square root of the mean squared distance from the mean.
 *
 * @param {Object<string, {percent: number}>} entries - Keyed by subreddit name.
 * @returns {number} The standard deviation, or NaN when `entries` has no keys.
 */
function getStdDiv(entries) {
  const mean = getMean(entries);
  const percents = Object.keys(entries).map((sub) => entries[sub].percent);

  // Square the deviations, average them, then take the root once. Taking the
  // root of each squared deviation only yields |d|, i.e. the mean absolute
  // deviation, which is a different (smaller or equal) statistic.
  const sumOfSquares = percents.reduce((acc, value) => {
    const deviation = value - mean;
    return acc + deviation * deviation;
  }, 0);

  return Math.sqrt(sumOfSquares / percents.length);
}

/**
 * Renders compiled results as CSV text.
 *
 * @param {Object<string, {deleted: number, total: number, percent: number}>} data -
 *   Keyed by subreddit name; `percent` is a fraction (0..1).
 * @returns {string} Header line `sub,deleted,total,percent` followed by one
 *   line per subreddit, with percent shown as a percentage rounded to two
 *   decimal places. Every line ends with a newline.
 */
function getCSV(data) {
  const header = 'sub,deleted,total,percent\n';
  const lines = Object.keys(data).map((sub) => {
    const { deleted, total, percent } = data[sub];
    const shown = Math.round(percent * 10000) / 100;
    return `${sub},${deleted},${total},${shown}%\n`;
  });
  return header + lines.join('');
}

/**
 * Formats a fraction as a percentage string with at most two decimals,
 * e.g. 0.0253 -> '2.53%'.
 *
 * @param {number} i - Fraction to format (1 means 100%).
 * @returns {string} The rounded percentage followed by '%'.
 */
function formatPercent(i) {
  // Math.round takes a single argument, so scale up, round, then scale down.
  const hundredths = Math.round(i * 10000);
  return `${hundredths / 100}%`;
}

/**
 * Prints summary statistics and a CSV table for compiled results.
 *
 * Two console.log calls are made: first a one-line summary (mean, median,
 * standard deviation, min, max of the `percent` field), then the CSV text
 * produced by getCSV.
 *
 * @param {Object<string, {deleted: number, total: number, percent: number}>} entries -
 *   Keyed by subreddit name, as returned by compile.
 */
function report(entries) {
  const percents = Object.keys(entries).map((sub) => entries[sub].percent);
  const highest = formatPercent(Math.max(...percents));
  const lowest = formatPercent(Math.min(...percents));
  const mean = formatPercent(getMean(entries));
  const median = formatPercent(getMedian(entries));
  const deviation = formatPercent(getStdDiv(entries));

  console.log(`Mean: ${mean}, Median: ${median}, Standard Deviation: ${deviation}, Min: ${lowest}, Max: ${highest}`);
  console.log(getCSV(entries));
}

// Named lists of subreddits that can be handed to analyzeToJSON / analyzeToCSV.
var presets = {
  top10: [
    'funny',
    'AskReddit',
    'gaming',
    'pics',
    'science',
    'worldnews',
    'todayilearned',
    'aww',
    'movies',
    'videos',
  ],
  custom: [
    'AskReddit',
    'AskMen',
    'AskWomen',
    'TwoXChromosomes',
    'sex',
    'relationships',
    'relationship_advice',
    'unpopularopinion',
    'news',
    'politics',
    'The_Donald',
    'MGTOW',
    'AmItheAsshole',
  ],
};

// Recorded analysis runs, grouped by preset name. Every array element is one
// run in the shape printed by analyzeToJSON (subreddit -> { deleted, total });
// pass one of the arrays to compile() to sum the runs.
var results = {
  top10: [
    {
      funny: { deleted: 3, total: 5053 },
      AskReddit: { deleted: 34, total: 8687 },
      gaming: { deleted: 2, total: 3954 },
      pics: { deleted: 4, total: 3691 },
      science: { deleted: 195, total: 2786 },
      worldnews: { deleted: 30, total: 5580 },
      todayilearned: { deleted: 1, total: 4889 },
      aww: { deleted: 5, total: 3133 },
      movies: { deleted: 10, total: 5129 },
      videos: { deleted: 1, total: 4038 },
    }, // add more results here
  ],
  custom: [
    {
      AskReddit: { deleted: 34, total: 8723 },
      AskMen: { deleted: 0, total: 1626 },
      AskWomen: { deleted: 37, total: 2351 },
      TwoXChromosomes: { deleted: 67, total: 1872 },
      sex: { deleted: 9, total: 549 },
      relationships: { deleted: 12, total: 1276 },
      relationship_advice: { deleted: 12, total: 2684 },
      unpopularopinion: { deleted: 5, total: 2233 },
      news: { deleted: 20, total: 3252 },
      politics: { deleted: 30, total: 7723 },
      The_Donald: { deleted: 21, total: 3069 },
      MGTOW: { deleted: 1, total: 850 },
      AmItheAsshole: { deleted: 30, total: 8131 },
    }, // add more results here
  ],
};

// analyzeToJSON(presets.top10)
// analyzeToCSV(presets.custom)
// report(compile(results.top10))
// report(compile(results.custom))

Keywords: reddit, subreddit, comments, censorship

How to Compile Go Code 40% Faster With RAM Disk

Go is already famous for its impressive compilation speed. But if you came from a scripting language with practically no compile time, you're probably not satisfied. Here's the compile time after running go build main.go a few times to warm up the file system.


real	0m2.590s
user	0m2.685s
sys	 0m0.775s

It's easily 2 to 3 times slower when the files aren't cached, which could happen if your disk is experiencing a lot of usage. Here's the compile time when compiling from a RAM disk. A whopping 40% faster in user time; almost a second off.


real	0m1.871s
user	0m2.124s
sys	 0m0.380s

Here's the bash script to get things working:


#!/bin/sh

# Create the mount point; -p makes this a no-op when it already exists.
mkdir -p ~/ramdisk
# Mount a 512 MB tmpfs (RAM-backed file system) on it.
sudo mount -t tmpfs -o size=512M tmpfs ~/ramdisk
# Copy the Go tree to ~/ramdisk/go and the project to ~/ramdisk/project
# (the project's .git directory is left out).
rsync -ah ~/go ~/ramdisk/
rsync -ah --exclude '.git' ~/path/to/project ~/ramdisk
# export only affects this shell and its children: source this script
# (". ./script.sh") if the calling shell should see the new GOPATH.
export GOPATH=$HOME/ramdisk/go

This creates a directory under the home folder as ~/ramdisk. Then assigns 512MB disk space and mounts it on the RAM. The rsync calls copy all Go files and project files to the RAM disk. Finally, it sets GOPATH to the new Go path under ~/ramdisk.

The next step is to reflect all file changes to the RAM disk instead of editing the files directly on it. This way you don't have to worry about losing your work. To do that we need a tool to watch for file changes and automatically duplicate the file. You can use any tool you like e.g. inotify, fswatch, nodemon etc. I'm going to use xnotify, a high level tool which can help with the build process.


# Watch the current directory, rebuild and rerun on change, and hand every
# line xnotify prints to copy.sh (one invocation per line via xargs -L 1).
# NOTE(review): --base points at /home/vagrant/ramdisk/project while the other
# scripts use ~/ramdisk/project; confirm both are the same directory.
./xnotify --verbose -i . --base /home/vagrant/ramdisk/project --batch 50 -- go build cmd/main.go -- ./main | xargs -L 1 ./copy.sh

copy.sh:


#!/bin/sh

# Copies one changed project file to its counterpart on the RAM disk.
# xargs passes the words of one xnotify output line as arguments; the file
# path is taken from the second one.
# NOTE(review): presumably the first word is the event type; confirm against
# xnotify's --verbose output format.

NAME=$2
SRC=/path/to/project/$NAME
# Quote every expansion so names containing spaces or glob characters are
# treated as a single path. Only regular files are copied; deleted files and
# directories are skipped.
if [ -f "$SRC" ]; then
  echo "Copying: $NAME"
  cp "$SRC" ~/ramdisk/project/"$NAME"
fi

The command above basically copies the file to the RAM disk and runs go build cmd/main.go && ./main when a file changes. Now if we want to stop using the RAM disk we just need to run this script:


#!/bin/sh

# List files that are still open on the RAM disk; umount fails while the
# file system is busy.
sudo lsof -n ~/ramdisk
# Remove the mount point only after the tmpfs was really unmounted, and put
# the option before the operand so this also works with non-GNU rm.
if sudo umount ~/ramdisk; then
  rm -r ~/ramdisk
fi
# export only affects this shell and its children: source this script if the
# calling shell should get the original GOPATH back.
export GOPATH=$HOME/go

Keywords: go, golang, compile, speed, ram, disk, ramdisk

Theory of Cost Driven Testing

Purpose

In this article we shall explore guidelines that can help us reap the benefits of automated software testing with the least effort and time possible. There's always a trade-off between writing bug-free software and delivering results quickly. Thanks to the law of diminishing returns, we know that every additional effort produces smaller returns than the one before. This is especially true in startups and small dev teams with tight deadlines. Why is some software "bug free" despite having no automated tests? Why does some software have many bugs despite high code coverage? Can we write the tests that we are very likely to benefit from while not writing any test that is unlikely to help us?

High Cost Features

High cost features are those that must be working correctly or there will be dire consequences. A bug in the feature could have catastrophic consequences such as losing your job, losing money, someone dying etc. This should always be the first to have automated testing. Thorough testing is highly recommended for expected and unexpected values since any bug could have potentially disastrous effect. Everything else can come later.

We have to be strict when deciding whether a feature is critical, or we might end up saying that 90% of the code is critical. Would a bug cause the app to crash? That's not critical. Would it make customers angry? That's not critical either, since it can be used as an excuse for every bug.

High Frequency Code

Once all critical features are fully tested, we can move on to testing code that is more likely to have bugs that affect the user. These would be the features that are used most frequently, such as the start screen, login page etc. Rather than spending a day testing obscure features that most users would not use, we can use it to test features that users will most definitely be using. Examples of such obscure features would be about screens and settings pages. The simple reason behind this is to prioritize bugs that would affect 90% of the users over the ones affecting 10% of the users.

Since frequent usage increases the chance and impact of a bug, it stands to reason that we should prioritize testing with data that users are more likely to enter. That means we should give lower priority to testing edge cases and bad input, or simply not test them at all. For example, a user who enters his email in a date field might cause a horrifying error page, but he can easily rectify this by trying again and hopefully realizing his mistake.

Bug Prone Code

Next we can move on to bug-prone areas, starting from the most likely to the least likely. Some of the factors that increase the chance of bugs are: amount of custom code, number of inputs and outputs, logical complexity etc. That usually means testing high-level functions such as controllers, instead of low-level utility functions. This will let you get more code coverage with fewer tests, and also cover more flows, inputs and outputs. Functions that are made up of mostly third-party code will have lower priority since most of it is already well tested. It is quite unnecessary to test functions that are simple and infrequently used since they are 1) less likely to be buggy, 2) less likely to be affected by code changes, 3) much easier to get to 0 bugs.

That's All

Since the goal is to write as few tests as possible, any additional test means we waste more time and increase our technical debt. Hopefully with these guidelines we can write only 20% of the tests yet achieve 80% of the benefits of testing.

Keywords: unit testing, software testing, automated testing, programming

KPop Star Rain & Mediacorp Celebrities Red Carpet at Cartier - Ion

The Cartier flagship store at Ion Orchard has officially reopened and is now the largest boutique in South East Asia. There were special appearances by KPop star Rain and local celebrities Zoe Tay, Fann Wong, Christopher Lee, Rui En, Rebecca Lim and Sheila Sim.

Keywords: rain, mediacorp, celebrity, cartier, ion, opening, photography

Tiger Sugar Opening at Capitol Piazza

Popular Taichung brand Tiger Sugar has finally come to Singapore with the first store opening at Capitol Piazza. Creating the drink requires around 10 staff and 3 hours. The brand's emphasis on quality and authenticity extends even to the type of ice it uses. The ice is shaped like the kind typically served with whisky. The new store opened with a bang and a long queue that lasted over an hour.

Keywords: tiger sugar, opening, ceremony, event, lion dance, photographer

AIG Sentosa Golf Club and Tanglin Rugby Club 2018

Some examples of official photographs from AIG Sentosa Golf Club and Tanglin Rugby Club events held on 16 November and 17 November 2018.

Keywords: sports, youth, activity, photographer, event, photography, aig, golf, rugby

Event Photography Portfolio 2018

Keywords: singapore, photographer, portfolio

Cosplay Convention Photos 2018

All photos were captured during Singapore Toy, Game & Comic Convention 2018. They were captured spontaneously while roaming through the crowds and while covering the events happening. Check out my other works to see what I can offer you.

Keywords: cosplay, event, photo, photographer, singapore, exhibition, stgxx

What Permission To Set For User File Upload Directory (Web Folder)

TLDR:


# Owner and group get read/write, others read-only; capital X adds execute
# (traverse) on directories, and s in the group field sets the setgid bit.
sudo chmod u=rwX,g=rwXs,o=rX -R path/to/folder
# Make the web server account (www-data) owner and group of the folder.
sudo chown www-data:www-data path/to/folder
# Add your own login to the www-data group so you can access the files too.
sudo adduser username www-data

Introduction

If you've ever run into a problem where your web app/server (Apache or NGINX) cannot read or write from a web folder (user upload directory), it's probably because of the file permissions. Unfortunately most people recommend setting it to 777, giving full access to everyone. Even some of the top answers on Stack Overflow recommend this. Let's see why this is a bad idea, and what the correct way is.

The Problem

To see why 777 is dangerous, we need to understand what's happening. It's giving read, write and execute access to owner, group and public! Let's imagine a common scenario. You have a PHP app that lets users upload to http://website.com/uploads/their-file. They upload a PHP file, virus.php, to http://website.com/uploads/virus.php. They visit that page… You're dead, because you've given execute access.

The Solution

u=rwX,g=rwXs,o=rX

I'm assuming you're familiar with the chmod format. There are 2 letters which you might not be familiar with. X, with a capital, means give execute access to directories, but not to regular files (unless the file already has an execute bit set). This allows you to perform ls, but will not let you execute any script or program. s in the group field is the setgid bit (not the sticky bit, which is t); on a directory it causes new files and directories created inside to inherit the directory's group. Useful when creating a new directory under a different user e.g. cron or ssh. In this case, we want new directories to belong to the www-data group so that NGINX can access those files even if they were created by other users.

And of course remember to add your user to www-data group so that you will be able to access them through ssh or cron.

Keywords: permission, web, file, folder, directory, upload, linux, chmod, chown

Singapore Party Photographer For Hire

Have a launch party? Or maybe a birthday party? Or even a wedding anniversary or company party. You'll want to capture beautiful and fun moments to be remembered and shared. It takes patience and quick reaction to capture fun moments. That takes experience. Take a look at some of the events I've shot.

Keywords: party, launch, celebration, birthday, company, anniversary, photographer
Avatar

Contact